// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package access
import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"runtime"
"sort"
"sync/atomic"
"time"
"github.com/hashicorp/consul/api"
"gopkg.in/natefinch/lumberjack.v2"
errcode "github.com/cubefs/cubefs/blobstore/common/errors"
"github.com/cubefs/cubefs/blobstore/common/proto"
"github.com/cubefs/cubefs/blobstore/common/resourcepool"
"github.com/cubefs/cubefs/blobstore/common/rpc"
"github.com/cubefs/cubefs/blobstore/common/trace"
"github.com/cubefs/cubefs/blobstore/util/defaulter"
"github.com/cubefs/cubefs/blobstore/util/log"
"github.com/cubefs/cubefs/blobstore/util/retry"
"github.com/cubefs/cubefs/blobstore/util/task"
)
const (
defaultMaxSizePutOnce int64 = 1 << 28 // 256MB
defaultMaxPartRetry int = 3
defaultMaxHostRetry int = 3
defaultPartConcurrence int = 4
defaultServiceInterval int = 3600 // one hour.
defaultServiceName = "access"
)
// RPCConnectMode self-defined rpc client connection config setting
type RPCConnectMode uint8
// timeout: [short - - - - - - - - -> long]
// quick --> general --> default --> slow --> nolimit
// speed: 40MB --> 20MB --> 10MB --> 4MB --> nolimit
const (
DefaultConnMode RPCConnectMode = iota
QuickConnMode
GeneralConnMode
SlowConnMode
NoLimitConnMode
)
func (mode RPCConnectMode) getConfig(speed float64, timeout, baseTimeout int64) rpc.Config {
getSpeed := func(defaultVal float64) float64 {
if speed > 0 {
return speed
}
return defaultVal
}
getBaseTimeout := func(defaultVal int64) int64 {
if baseTimeout > 0 {
return baseTimeout
}
return defaultVal
}
getTimeout := func(speed float64) int64 {
if timeout > 0 {
return timeout
}
return 5 * (1 << 30) * 1e3 / int64(speed*(1<<20))
}
config := rpc.Config{
// the whole request and response timeout
ClientTimeoutMs: getTimeout(getSpeed(10)),
BodyBandwidthMBPs: getSpeed(10),
BodyBaseTimeoutMs: getBaseTimeout(30 * 1000),
Tc: rpc.TransportConfig{
// dial timeout
DialTimeoutMs: 5 * 1000,
// response header timeout after sending the request
ResponseHeaderTimeoutMs: 5 * 1000,
// IdleConnTimeout is the maximum amount of time an idle
// (keep-alive) connection will remain idle before closing
// itself. Zero means no limit.
IdleConnTimeoutMs: 30 * 1000,
MaxIdleConns: 0,
MaxConnsPerHost: 2048,
MaxIdleConnsPerHost: 1024,
DisableCompression: true,
},
}
switch mode {
case QuickConnMode:
config.ClientTimeoutMs = getTimeout(getSpeed(40))
config.BodyBandwidthMBPs = getSpeed(40)
config.BodyBaseTimeoutMs = getBaseTimeout(3 * 1000)
config.Tc.DialTimeoutMs = 2 * 1000
config.Tc.ResponseHeaderTimeoutMs = 2 * 1000
config.Tc.IdleConnTimeoutMs = 10 * 1000
case GeneralConnMode:
config.ClientTimeoutMs = getTimeout(getSpeed(20))
config.BodyBandwidthMBPs = getSpeed(20)
config.BodyBaseTimeoutMs = getBaseTimeout(10 * 1000)
config.Tc.DialTimeoutMs = 3 * 1000
config.Tc.ResponseHeaderTimeoutMs = 3 * 1000
config.Tc.IdleConnTimeoutMs = 30 * 1000
case SlowConnMode:
config.ClientTimeoutMs = getTimeout(getSpeed(4))
config.BodyBandwidthMBPs = getSpeed(4)
config.BodyBaseTimeoutMs = getBaseTimeout(120 * 1000)
config.Tc.DialTimeoutMs = 10 * 1000
config.Tc.ResponseHeaderTimeoutMs = 10 * 1000
config.Tc.IdleConnTimeoutMs = 60 * 1000
case NoLimitConnMode:
config.ClientTimeoutMs = 0
config.BodyBandwidthMBPs = getSpeed(0)
config.BodyBaseTimeoutMs = getBaseTimeout(0)
config.Tc.DialTimeoutMs = 0
config.Tc.ResponseHeaderTimeoutMs = 0
config.Tc.IdleConnTimeoutMs = 600 * 1000
default:
}
return config
}
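// A worked instance of the timeout formula above (illustrative only, derived
// from getConfig): with QuickConnMode and no overrides the default speed is 40 MB/s, so
//
//	ClientTimeoutMs   = 5 * (1<<30) * 1e3 / (40 * (1<<20)) = 128000 // ms, i.e. a 5 GiB budget at 40 MB/s
//	BodyBandwidthMBPs = 40
//	BodyBaseTimeoutMs = 3000
//
// while DefaultConnMode with its default 10 MB/s resolves to ClientTimeoutMs = 512000.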
// Config access client config
type Config struct {
// ConnMode rpc connection timeout setting
ConnMode RPCConnectMode
// ClientTimeoutMs the whole request and response timeout
ClientTimeoutMs int64
// BodyBandwidthMBPs body bandwidth in MB/s, used to derive the body read
// timeout for both request and response:
// timeout = ContentLength/BodyBandwidthMBPs + BodyBaseTimeoutMs
BodyBandwidthMBPs float64
// BodyBaseTimeoutMs base timeout for read body
BodyBaseTimeoutMs int64
// Consul is consul config for discovering service
Consul ConsulConfig
// ServiceIntervalS is the interval in seconds for refreshing service discovery
ServiceIntervalS int
// PriorityAddrs preferred addresses of the access service, tried first when retrying
PriorityAddrs []string
// MaxSizePutOnce max size allowed for the single-shot put object interface
MaxSizePutOnce int64
// MaxPartRetry max retry times when putting one part, 0 means forever
MaxPartRetry int
// MaxHostRetry max number of access service hosts to retry
MaxHostRetry int
// PartConcurrence concurrence of put parts
PartConcurrence int
// rpc selector config
// FailRetryIntervalS failure retry interval in seconds, default -1;
// if FailRetryIntervalS < 0, removal of failed hosts is disabled.
FailRetryIntervalS int
// Within MaxFailsPeriodS, if the number of failures is greater than or equal to MaxFails,
// the host is considered disconnected.
MaxFailsPeriodS int
// HostTryTimes Number of host failure retries
HostTryTimes int
// RPCConfig user-defined rpc config
// All connections will use this config if it is not nil
// ConnMode is ignored when RPCConfig is set
RPCConfig *rpc.Config
// LogLevel client output logging level.
LogLevel log.Level
// Logger redirects all client logging to this logger if set.
// It is an io.WriteCloser that writes to the specified filename.
// YOU should CLOSE it once you no longer use the client.
Logger *Logger
}
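// An illustrative minimal Config (a sketch only; the consul address, priority
// address and log file path below are placeholders, not real endpoints):
//
//	cfg := access.Config{
//		ConnMode:        access.GeneralConnMode,
//		Consul:          access.ConsulConfig{Address: "127.0.0.1:8500"},
//		PriorityAddrs:   []string{"http://127.0.0.1:9500"},
//		PartConcurrence: 4,
//		Logger:          &access.Logger{Filename: "/tmp/access-client.log"},
//	}
//
// Unset fields fall back to the defaults applied in New.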
// ConsulConfig alias of consul api.Config
// Fixup: client and sdk use the same config type
type ConsulConfig = api.Config
// Logger alias of lumberjack Logger
// See more at: https://github.com/natefinch/lumberjack
type Logger = lumberjack.Logger
// client access rpc client
type client struct {
config Config
rpcClient atomic.Value
stop chan struct{}
}
// API access api for s3
// To trace the request id, prefer a ctx built with WithRequestID(ctx, rid).
type API interface {
// Put object once if size is not greater than MaxSizePutOnce, otherwise put blobs one by one.
// returns a location and a map of the hash sums you requested.
//
// If the PutArgs body is a *bytes.Buffer, *bytes.Reader, or *strings.Reader,
// GetBody is populated and the single-shot Put request can be retried.
Put(ctx context.Context, args *PutArgs) (location Location, hashSumMap HashSumMap, err error)
// Get object, range is supported.
Get(ctx context.Context, args *GetArgs) (body io.ReadCloser, err error)
// Delete all blobs in these locations.
// returns the locations that have not yet been deleted when error is not nil.
Delete(ctx context.Context, args *DeleteArgs) (failedLocations []Location, err error)
}
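// An illustrative end-to-end sketch of the API (error handling elided; cfg is
// a Config as above and the payload is a placeholder):
//
//	cli, err := access.New(cfg)
//	ctx := access.WithRequestID(context.Background(), "example-rid")
//
//	data := []byte("hello blobstore")
//	loc, hashes, err := cli.Put(ctx, &access.PutArgs{
//		Size:   int64(len(data)),
//		Hashes: access.HashAlgCRC32 | access.HashAlgMD5,
//		Body:   bytes.NewReader(data),
//	})
//	fmt.Println(hashes.All())
//
//	body, err := cli.Get(ctx, &access.GetArgs{Location: loc, ReadSize: loc.Size})
//	defer body.Close()
//
//	_, err = cli.Delete(ctx, &access.DeleteArgs{Locations: []access.Location{loc}})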
var _ API = (*client)(nil)
type noopBody struct{}
var _ io.ReadCloser = (*noopBody)(nil)
func (rc noopBody) Read(p []byte) (n int, err error) { return 0, io.EOF }
func (rc noopBody) Close() error { return nil }
var memPool *resourcepool.MemPool
func init() {
memPool = resourcepool.NewMemPool(map[int]int{
1 << 12: -1,
1 << 14: -1,
1 << 18: -1,
1 << 20: -1,
1 << 22: -1,
1 << 23: -1,
1 << 24: -1,
})
}
// New returns an access API
func New(cfg Config) (API, error) {
defaulter.LessOrEqual(&cfg.MaxSizePutOnce, defaultMaxSizePutOnce)
defaulter.Less(&cfg.MaxPartRetry, defaultMaxPartRetry)
defaulter.LessOrEqual(&cfg.MaxHostRetry, defaultMaxHostRetry)
defaulter.LessOrEqual(&cfg.PartConcurrence, defaultPartConcurrence)
if cfg.ServiceIntervalS < 300 { // at least 5 minutes
cfg.ServiceIntervalS = defaultServiceInterval
}
log.SetOutputLevel(cfg.LogLevel)
if cfg.Logger != nil {
log.SetOutput(cfg.Logger)
}
c := &client{
config: cfg,
stop: make(chan struct{}),
}
runtime.SetFinalizer(c, func(c *client) {
rpcClient, ok := c.rpcClient.Load().(rpc.Client)
if ok {
rpcClient.Close()
}
close(c.stop)
})
if cfg.Consul.Address == "" {
if len(cfg.PriorityAddrs) < 1 {
return nil, errcode.ErrAccessServiceDiscovery
}
c.rpcClient.Store(getClient(&cfg, cfg.PriorityAddrs))
return c, nil
}
consulConfig := cfg.Consul
consulClient, err := api.NewClient(&consulConfig)
if err != nil {
return nil, errcode.ErrAccessServiceDiscovery
}
first := true
serviceName := defaultServiceName
hostGetter := func() ([]string, error) {
if first && len(cfg.PriorityAddrs) > 0 {
hosts := make([]string, len(cfg.PriorityAddrs))
copy(hosts, cfg.PriorityAddrs[:])
first = false
return hosts, nil
}
services, _, err := consulClient.Health().Service(serviceName, "", true, nil)
if err != nil {
return nil, err
}
hosts := make([]string, 0, len(services))
for _, s := range services {
address := s.Service.Address
if address == "" {
address = s.Node.Address
}
hosts = append(hosts, fmt.Sprintf("http://%s:%d", address, s.Service.Port))
}
if len(hosts) == 0 {
return nil, fmt.Errorf("unavailable service")
}
return hosts, nil
}
hosts, err := hostGetter()
if err != nil {
log.Errorf("get hosts from consul failed: %v", err)
return nil, errcode.ErrAccessServiceDiscovery
}
c.rpcClient.Store(getClient(&cfg, hosts))
ticker := time.NewTicker(time.Duration(cfg.ServiceIntervalS) * time.Second)
go func() {
for {
old := hosts
select {
case <-ticker.C:
hosts, err = hostGetter()
if err != nil {
log.Warnf("update hosts from consul failed: %v", err)
continue
}
if isUpdated(old, hosts) {
oldClient, ok := c.rpcClient.Load().(rpc.Client)
if ok && oldClient != nil {
oldClient.Close()
}
c.rpcClient.Store(getClient(&cfg, hosts))
}
case <-c.stop:
ticker.Stop()
return
}
}
}()
return c, nil
}
func isUpdated(a, b []string) bool {
if len(a) != len(b) {
return true
}
sort.Slice(a, func(i, j int) bool { return a[i] < a[j] })
sort.Slice(b, func(i, j int) bool { return b[i] < b[j] })
for i := 0; i < len(a); i++ {
if a[i] != b[i] {
return true
}
}
return false
}
func getClient(cfg *Config, hosts []string) rpc.Client {
lbConfig := &rpc.LbConfig{
Hosts: hosts,
FailRetryIntervalS: cfg.FailRetryIntervalS,
MaxFailsPeriodS: cfg.MaxFailsPeriodS,
HostTryTimes: cfg.HostTryTimes,
RequestTryTimes: cfg.MaxHostRetry,
ShouldRetry: shouldRetry,
}
if cfg.RPCConfig == nil {
rpcConfig := cfg.ConnMode.getConfig(cfg.BodyBandwidthMBPs,
cfg.ClientTimeoutMs, cfg.BodyBaseTimeoutMs)
lbConfig.Config = rpcConfig
return rpc.NewLbClient(lbConfig, nil)
}
lbConfig.Config = *cfg.RPCConfig
return rpc.NewLbClient(lbConfig, nil)
}
func (c *client) Put(ctx context.Context, args *PutArgs) (location Location, hashSumMap HashSumMap, err error) {
if args.Size == 0 {
hashSumMap := args.Hashes.ToHashSumMap()
for alg := range hashSumMap {
hashSumMap[alg] = alg.ToHasher().Sum(nil)
}
return Location{Blobs: make([]SliceInfo, 0)}, hashSumMap, nil
}
ctx = withReqidContext(ctx)
if args.Size <= c.config.MaxSizePutOnce {
return c.putObject(ctx, args)
}
return c.putParts(ctx, args)
}
func (c *client) putObject(ctx context.Context, args *PutArgs) (location Location, hashSumMap HashSumMap, err error) {
rpcClient := c.rpcClient.Load().(rpc.Client)
urlStr := fmt.Sprintf("/put?size=%d&hashes=%d", args.Size, args.Hashes)
req, err := http.NewRequest(http.MethodPut, urlStr, args.Body)
if err != nil {
return
}
resp := &PutResp{}
if err = rpcClient.DoWith(ctx, req, resp, rpc.WithCrcEncode()); err == nil {
location = resp.Location
hashSumMap = resp.HashSumMap
}
return
}
type blobPart struct {
cid proto.ClusterID
vid proto.Vid
bid proto.BlobID
size int
token string
buf []byte
}
func (c *client) putPartsBatch(ctx context.Context, parts []blobPart) error {
rpcClient := c.rpcClient.Load().(rpc.Client)
tasks := make([]func() error, 0, len(parts))
for _, part := range parts {
part := part
tasks = append(tasks, func() error {
urlStr := fmt.Sprintf("/putat?clusterid=%d&volumeid=%d&blobid=%d&size=%d&hashes=%d&token=%s",
part.cid, part.vid, part.bid, part.size, 0, part.token)
req, err := http.NewRequest(http.MethodPut, urlStr, bytes.NewReader(part.buf))
if err != nil {
return err
}
resp := &PutAtResp{}
return rpcClient.DoWith(ctx, req, resp, rpc.WithCrcEncode())
})
}
if err := task.Run(context.Background(), tasks...); err != nil {
for _, part := range parts {
part := part
// asynchronously delete blob
go func() {
urlStr := fmt.Sprintf("/deleteblob?clusterid=%d&volumeid=%d&blobid=%d&size=%d&token=%s",
part.cid, part.vid, part.bid, part.size, part.token)
req, err := http.NewRequest(http.MethodDelete, urlStr, nil)
if err != nil {
return
}
rpcClient.DoWith(ctx, req, nil)
}()
}
return err
}
return nil
}
func (c *client) readerPipeline(span trace.Span, reqBody io.Reader,
closeCh <-chan struct{}, size, blobSize int) <-chan []byte {
ch := make(chan []byte, c.config.PartConcurrence-1)
go func() {
for size > 0 {
toread := blobSize
if toread > size {
toread = size
}
buf, _ := memPool.Alloc(toread)
buf = buf[:toread]
_, err := io.ReadFull(reqBody, buf)
if err != nil {
span.Error("read buffer from request", err)
memPool.Put(buf)
close(ch)
return
}
select {
case <-closeCh:
memPool.Put(buf)
close(ch)
return
case ch <- buf:
}
size -= toread
}
close(ch)
}()
return ch
}
func (c *client) putParts(ctx context.Context, args *PutArgs) (Location, HashSumMap, error) {
span := trace.SpanFromContextSafe(ctx)
rpcClient := c.rpcClient.Load().(rpc.Client)
hashSumMap := args.Hashes.ToHashSumMap()
hasherMap := make(HasherMap, len(hashSumMap))
for alg := range hashSumMap {
hasherMap[alg] = alg.ToHasher()
}
reqBody := args.Body
if len(hasherMap) > 0 {
reqBody = io.TeeReader(args.Body, hasherMap.ToWriter())
}
var (
loc Location
tokens []string
)
signArgs := SignArgs{}
success := false
defer func() {
if success {
return
}
locations := signArgs.Locations[:]
if len(locations) > 1 {
signArgs.Location = loc.Copy()
signResp := &SignResp{}
if err := rpcClient.PostWith(ctx, "/sign", signResp, signArgs); err == nil {
locations = []Location{signResp.Location.Copy()}
}
}
if len(locations) > 0 {
if _, err := c.Delete(ctx, &DeleteArgs{Locations: locations}); err != nil {
span.Warnf("clean location '%+v' failed %s", locations, err.Error())
}
}
}()
// alloc
allocResp := &AllocResp{}
if err := rpcClient.PostWith(ctx, "/alloc", allocResp, AllocArgs{Size: uint64(args.Size)}); err != nil {
return allocResp.Location, nil, err
}
loc = allocResp.Location
tokens = allocResp.Tokens
signArgs.Locations = append(signArgs.Locations, loc.Copy())
// buffer pipeline
closeCh := make(chan struct{})
bufferPipe := c.readerPipeline(span, reqBody, closeCh, int(loc.Size), int(loc.BlobSize))
defer func() {
close(closeCh)
// wait for the pipeline to close if an error occurred
for buf := range bufferPipe {
if len(buf) > 0 {
memPool.Put(buf)
}
}
}()
releaseBuffer := func(parts []blobPart) {
for _, part := range parts {
memPool.Put(part.buf)
}
}
currBlobIdx := 0
currBlobCount := uint32(0)
remainSize := loc.Size
restPartsLoc := loc
readSize := 0
for readSize < int(loc.Size) {
parts := make([]blobPart, 0, c.config.PartConcurrence)
// wait for at least one blob
buf, ok := <-bufferPipe
if !ok && readSize < int(loc.Size) {
return Location{}, nil, errcode.ErrAccessReadRequestBody
}
readSize += len(buf)
parts = append(parts, blobPart{size: len(buf), buf: buf})
more := true
for more && len(parts) < c.config.PartConcurrence {
select {
case buf, ok := <-bufferPipe:
if !ok {
if readSize < int(loc.Size) {
releaseBuffer(parts)
return Location{}, nil, errcode.ErrAccessReadRequestBody
}
more = false
} else {
readSize += len(buf)
parts = append(parts, blobPart{size: len(buf), buf: buf})
}
default:
more = false
}
}
tryTimes := c.config.MaxPartRetry
for {
if len(loc.Blobs) > MaxLocationBlobs {
releaseBuffer(parts)
return Location{}, nil, errcode.ErrUnexpected
}
// feed new params
currIdx := currBlobIdx
currCount := currBlobCount
for i := range parts {
token := tokens[currIdx]
if restPartsLoc.Size > uint64(loc.BlobSize) && parts[i].size < int(loc.BlobSize) {
token = tokens[currIdx+1]
}
parts[i].token = token
parts[i].cid = loc.ClusterID
parts[i].vid = loc.Blobs[currIdx].Vid
parts[i].bid = loc.Blobs[currIdx].MinBid + proto.BlobID(currCount)
currCount++
if loc.Blobs[currIdx].Count == currCount {
currIdx++
currCount = 0
}
}
err := c.putPartsBatch(ctx, parts)
if err == nil {
for _, part := range parts {
remainSize -= uint64(part.size)
currBlobCount++
// next blobs
if loc.Blobs[currBlobIdx].Count == currBlobCount {
currBlobIdx++
currBlobCount = 0
}
}
break
}
span.Warn("putat parts", err)
if tryTimes > 0 { // has retry setting
if tryTimes == 1 {
releaseBuffer(parts)
span.Error("exceed the max retry limit", c.config.MaxPartRetry)
return Location{}, nil, errcode.ErrUnexpected
}
tryTimes--
}
var restPartsResp *AllocResp
// alloc the rest parts
err = retry.Timed(3, 10).RuptOn(func() (bool, error) {
resp := &AllocResp{}
if err := rpcClient.PostWith(ctx, "/alloc", resp, AllocArgs{
Size: remainSize,
BlobSize: loc.BlobSize,
CodeMode: loc.CodeMode,
AssignClusterID: loc.ClusterID,
}); err != nil {
return true, err
}
if len(resp.Location.Blobs) > 0 {
if newVid := resp.Location.Blobs[0].Vid; newVid == loc.Blobs[currBlobIdx].Vid {
return false, fmt.Errorf("alloc the same vid %d", newVid)
}
}
restPartsResp = resp
return true, nil
})
if err != nil {
releaseBuffer(parts)
span.Error("alloc another parts to put", err)
return Location{}, nil, errcode.ErrUnexpected
}
restPartsLoc = restPartsResp.Location
signArgs.Locations = append(signArgs.Locations, restPartsLoc.Copy())
if currBlobCount > 0 {
loc.Blobs[currBlobIdx].Count = currBlobCount
currBlobIdx++
}
loc.Blobs = append(loc.Blobs[:currBlobIdx], restPartsLoc.Blobs...)
tokens = append(tokens[:currBlobIdx], restPartsResp.Tokens...)
currBlobCount = 0
}
releaseBuffer(parts)
}
if len(signArgs.Locations) > 1 {
signArgs.Location = loc.Copy()
// sign
signResp := &SignResp{}
if err := rpcClient.PostWith(ctx, "/sign", signResp, signArgs); err != nil {
span.Error("sign location with crc", err)
return Location{}, nil, errcode.ErrUnexpected
}
loc = signResp.Location
}
for alg, hasher := range hasherMap {
hashSumMap[alg] = hasher.Sum(nil)
}
success = true
return loc, hashSumMap, nil
}
func (c *client) Get(ctx context.Context, args *GetArgs) (body io.ReadCloser, err error) {
if !args.IsValid() {
return nil, errcode.ErrIllegalArguments
}
rpcClient := c.rpcClient.Load().(rpc.Client)
ctx = withReqidContext(ctx)
if args.Location.Size == 0 || args.ReadSize == 0 {
return noopBody{}, nil
}
resp, err := rpcClient.Post(ctx, "/get", args)
if err != nil {
return nil, err
}
if resp.StatusCode >= 400 {
return nil, rpc.NewError(resp.StatusCode, "StatusCode", fmt.Errorf("code: %d", resp.StatusCode))
}
return resp.Body, nil
}
func (c *client) Delete(ctx context.Context, args *DeleteArgs) ([]Location, error) {
if !args.IsValid() {
if args == nil {
return nil, errcode.ErrIllegalArguments
}
return args.Locations, errcode.ErrIllegalArguments
}
rpcClient := c.rpcClient.Load().(rpc.Client)
ctx = withReqidContext(ctx)
locations := make([]Location, 0, len(args.Locations))
for _, loc := range args.Locations {
if loc.Size > 0 {
locations = append(locations, loc.Copy())
}
}
if len(locations) == 0 {
return nil, nil
}
if err := retry.Timed(3, 10).On(func() error {
// access responds 2xx even if there are failed locations
deleteResp := &DeleteResp{}
if err := rpcClient.PostWith(ctx, "/delete", deleteResp,
DeleteArgs{Locations: locations}); err != nil && rpc.DetectStatusCode(err) != http.StatusIMUsed {
return err
}
if len(deleteResp.FailedLocations) > 0 {
locations = deleteResp.FailedLocations[:]
return errcode.ErrUnexpected
}
return nil
}); err != nil {
return locations, err
}
return nil, nil
}
func shouldRetry(code int, err error) bool {
if err != nil {
if httpErr, ok := err.(rpc.HTTPError); ok {
// 500 needs a retry on the next host
return httpErr.StatusCode() == http.StatusInternalServerError
}
return true
}
if code/100 != 4 && code/100 != 2 {
return true
}
return false
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package access
import (
"context"
"github.com/cubefs/cubefs/blobstore/common/trace"
)
type ctxKey uint8
const (
_operationName = "access_client"
)
const (
_ ctxKey = iota
reqidKey
)
// WithRequestID traces the request id through the full life of the request.
// The second parameter rid may be one of the types below:
// a string,
// an interface { String() string },
// an interface { TraceID() string },
// an interface { RequestID() string },
func WithRequestID(ctx context.Context, rid interface{}) context.Context {
return context.WithValue(ctx, reqidKey, rid)
}
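// An illustrative sketch (the request id value, the client cli and the
// location loc are placeholders):
//
//	ctx := access.WithRequestID(context.Background(), "req-2022-0001")
//	body, err := cli.Get(ctx, &access.GetArgs{Location: loc, ReadSize: loc.Size})
//
// Any of the accepted rid types behaves the same, e.g. a span that implements
// TraceID() string.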
func reqidFromContext(ctx context.Context) (string, bool) {
val := ctx.Value(reqidKey)
if val == nil {
return "", false
}
if rid, ok := val.(string); ok {
return rid, true
}
if rid, ok := val.(interface{ String() string }); ok {
return rid.String(), true
}
if rid, ok := val.(interface{ TraceID() string }); ok {
return rid.TraceID(), true
}
if rid, ok := val.(interface{ RequestID() string }); ok {
return rid.RequestID(), true
}
return "", false
}
func withReqidContext(ctx context.Context) context.Context {
if rid, ok := reqidFromContext(ctx); ok {
_, ctx := trace.StartSpanFromContextWithTraceID(ctx, _operationName, rid)
return ctx
}
if span := trace.SpanFromContext(ctx); span != nil {
return ctx
}
_, ctx = trace.StartSpanFromContext(ctx, _operationName)
return ctx
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package access
import (
"crypto/md5"
"crypto/sha1"
"crypto/sha256"
"encoding/base64"
"encoding/binary"
"encoding/hex"
"fmt"
"hash"
"hash/crc32"
"io"
"github.com/cubefs/cubefs/blobstore/common/codemode"
"github.com/cubefs/cubefs/blobstore/common/proto"
)
// HashAlgorithm hash.Hash algorithm when uploading data
type HashAlgorithm uint8
// defined hash algorithm
const (
HashAlgDummy HashAlgorithm = 1 << iota
HashAlgCRC32 // crc32 with IEEE
HashAlgMD5 // md5
HashAlgSHA1 // sha1
HashAlgSHA256 // sha256
)
const (
// HashSize dummy hash size
HashSize = 0
// MaxLocationBlobs max blobs length in Location
MaxLocationBlobs int = 4
// MaxDeleteLocations max locations of delete request
MaxDeleteLocations int = 1024
// MaxBlobSize max blob size for allocation
MaxBlobSize uint32 = 1 << 25 // 32MB
)
type dummyHash struct{}
var _ hash.Hash = (*dummyHash)(nil)
// implements hash.Hash
func (d dummyHash) Write(p []byte) (n int, err error) { return len(p), nil }
func (d dummyHash) Sum(b []byte) []byte { return []byte{} }
func (d dummyHash) Reset() { _ = struct{}{} }
func (d dummyHash) Size() int { return 0 }
func (d dummyHash) BlockSize() int { return 0 }
// ToHasher returns a new hash.Hash computing checksum
// the value of algorithm should be one of HashAlg*
func (alg HashAlgorithm) ToHasher() hash.Hash {
switch alg {
case HashAlgCRC32:
return crc32.NewIEEE()
case HashAlgMD5:
return md5.New()
case HashAlgSHA1:
return sha1.New()
case HashAlgSHA256:
return sha256.New()
default:
return dummyHash{}
}
}
// ToHashSumMap returns a new HashSumMap, decoded from the rpc url argument
func (alg HashAlgorithm) ToHashSumMap() HashSumMap {
h := make(HashSumMap)
for _, a := range []HashAlgorithm{
HashAlgDummy,
HashAlgCRC32,
HashAlgMD5,
HashAlgSHA1,
HashAlgSHA256,
} {
if alg&a == a {
h[a] = nil
}
}
return h
}
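// For example, following the bit definitions above:
//
//	alg := HashAlgCRC32 | HashAlgMD5 // 2 + 4 = 6
//	sums := alg.ToHashSumMap()       // keys HashAlgCRC32 and HashAlgMD5, values nil until filled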
// HasherMap map hasher of HashAlgorithm
type HasherMap map[HashAlgorithm]hash.Hash
// ToHashAlgorithm returns HashAlgorithm
func (h HasherMap) ToHashAlgorithm() HashAlgorithm {
alg := HashAlgorithm(0)
for k := range h {
alg |= k
}
return alg
}
// ToWriter returns an io.Writer that writes to every hasher
func (h HasherMap) ToWriter() io.Writer {
writers := make([]io.Writer, 0, len(h))
for _, hasher := range h {
writers = append(writers, hasher)
}
return io.MultiWriter(writers...)
}
// HashSumMap saves checksums in rpc calls
type HashSumMap map[HashAlgorithm][]byte
// GetSum get checksum value and ok via HashAlgorithm
//
// HashAlgDummy returns nil, bool
// HashAlgCRC32 returns uint32, bool
// HashAlgMD5 returns string(32), bool
// HashAlgSHA1 returns string(40), bool
// HashAlgSHA256 returns string(64), bool
func (h HashSumMap) GetSum(key HashAlgorithm) (interface{}, bool) {
b, ok := h[key]
if !ok {
return nil, false
}
switch key {
case HashAlgCRC32:
if len(b) != crc32.Size {
return nil, false
}
return uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24, true
case HashAlgMD5:
if len(b) != md5.Size {
return nil, false
}
return hex.EncodeToString(b[:]), true
case HashAlgSHA1:
if len(b) != sha1.Size {
return nil, false
}
return hex.EncodeToString(b[:]), true
case HashAlgSHA256:
if len(b) != sha256.Size {
return nil, false
}
return hex.EncodeToString(b[:]), true
default:
if len(b) != HashSize {
return nil, false
}
return nil, true
}
}
// GetSumVal get checksum only value via HashAlgorithm
func (h HashSumMap) GetSumVal(key HashAlgorithm) interface{} {
val, _ := h.GetSum(key)
return val
}
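// An illustrative read-out of a filled HashSumMap h, with the value types
// listed in the GetSum documentation above:
//
//	if crc, ok := h.GetSum(HashAlgCRC32); ok {
//		_ = crc.(uint32)
//	}
//	md5Hex, _ := h.GetSumVal(HashAlgMD5).(string) // 32 hex chars, or "" when absent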
// ToHashAlgorithm returns HashAlgorithm, encoded as the rpc url argument
func (h HashSumMap) ToHashAlgorithm() HashAlgorithm {
alg := HashAlgorithm(0)
for k := range h {
alg |= k
}
return alg
}
// All returns readable checksum
func (h HashSumMap) All() map[string]interface{} {
m := make(map[string]interface{})
for a, name := range map[HashAlgorithm]string{
HashAlgCRC32: "crc32",
HashAlgMD5: "md5",
HashAlgSHA1: "sha1",
HashAlgSHA256: "sha256",
} {
if val, ok := h.GetSum(a); ok {
m[name] = val
}
}
return m
}
// Location file location, 4 + 1 + 8 + 4 + 4 + len*16 bytes
//
// | ClusterID(4) | CodeMode(1) |
// | Size(8)                    |
// | BlobSize(4)  | Crc(4)      |
// | len*SliceInfo(16)          |
//
// ClusterID which cluster the file is in
// CodeMode is the ec encode mode, see definitions in "common/lib/codemode"
// Size is the file size
// BlobSize is the size of every blob except the last one, whose size = Size mod BlobSize (when the remainder is non-zero)
// Crc is the checksum; change anything in the location and the crc will mismatch
// Blobs all blob information
type Location struct {
_ [0]byte
ClusterID proto.ClusterID `json:"cluster_id"`
CodeMode codemode.CodeMode `json:"code_mode"`
Size uint64 `json:"size"`
BlobSize uint32 `json:"blob_size"`
Crc uint32 `json:"crc"`
Blobs []SliceInfo `json:"blobs"`
}
// SliceInfo blobs info, 8 + 4 + 4 bytes
//
// MinBid is the first blob id
// Vid is the volume all blobs are in
// Count is the number of consecutive blob ids; count=1 means just one blob
//
// blob ids = [MinBid, MinBid+count)
type SliceInfo struct {
_ [0]byte
MinBid proto.BlobID `json:"min_bid"`
Vid proto.Vid `json:"vid"`
Count uint32 `json:"count"`
}
// Blob is one piece of data in a location
//
// Bid is the blob id
// Vid is the volume the blob is in
// Size is the real size of the blob
type Blob struct {
Bid proto.BlobID
Vid proto.Vid
Size uint32
}
// Copy returns a deep copy of the Location
func (loc *Location) Copy() Location {
dst := Location{
ClusterID: loc.ClusterID,
CodeMode: loc.CodeMode,
Size: loc.Size,
BlobSize: loc.BlobSize,
Crc: loc.Crc,
Blobs: make([]SliceInfo, len(loc.Blobs)),
}
copy(dst.Blobs, loc.Blobs)
return dst
}
// Encode transfers the Location to a byte slice
// Returns a newly allocated buffer
//
// (n) means max-n bytes
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | field | crc | clusterid | codemode | size | blobsize |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | n-bytes | 4 | uvarint(5) | 1 | uvarint(10) | uvarint(5) |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// 25 + (5){len(blobs)} + len(Blobs) * 20
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | blobs   | minbid | vid | count | each next blob |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | n-bytes | (10)   | (5) | (5)   | (20)           |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
func (loc *Location) Encode() []byte {
if loc == nil {
return nil
}
n := 25 + 5 + len(loc.Blobs)*20
buf := make([]byte, n)
n = loc.Encode2(buf)
return buf[:n]
}
// Encode2 encodes the Location into buf; the buf is allocated and reused by the caller
// Returns the number of bytes written
// If the buffer is too small, Encode2 will panic
func (loc *Location) Encode2(buf []byte) int {
if loc == nil {
return 0
}
n := 0
binary.BigEndian.PutUint32(buf[n:], loc.Crc)
n += 4
n += binary.PutUvarint(buf[n:], uint64(loc.ClusterID))
buf[n] = byte(loc.CodeMode)
n++
n += binary.PutUvarint(buf[n:], uint64(loc.Size))
n += binary.PutUvarint(buf[n:], uint64(loc.BlobSize))
n += binary.PutUvarint(buf[n:], uint64(len(loc.Blobs)))
for _, blob := range loc.Blobs {
n += binary.PutUvarint(buf[n:], uint64(blob.MinBid))
n += binary.PutUvarint(buf[n:], uint64(blob.Vid))
n += binary.PutUvarint(buf[n:], uint64(blob.Count))
}
return n
}
// Decode parse location from buf
// Returns the number of bytes read
// Error is not nil when parsing failed
func (loc *Location) Decode(buf []byte) (int, error) {
if loc == nil {
return 0, fmt.Errorf("location receiver is nil")
}
location, n, err := DecodeLocation(buf)
if err != nil {
return n, err
}
*loc = location
return n, nil
}
// ToString transfer location to hex string
func (loc *Location) ToString() string {
return loc.HexString()
}
// HexString transfer location to hex string
func (loc *Location) HexString() string {
return hex.EncodeToString(loc.Encode())
}
// Base64String transfer location to base64 string
func (loc *Location) Base64String() string {
return base64.StdEncoding.EncodeToString(loc.Encode())
}
// Spread location blobs to slice
func (loc *Location) Spread() []Blob {
count := 0
for _, blob := range loc.Blobs {
count += int(blob.Count)
}
blobs := make([]Blob, 0, count)
for _, blob := range loc.Blobs {
for offset := uint32(0); offset < blob.Count; offset++ {
blobs = append(blobs, Blob{
Bid: blob.MinBid + proto.BlobID(offset),
Vid: blob.Vid,
Size: loc.BlobSize,
})
}
}
if len(blobs) > 0 && loc.BlobSize > 0 {
if lastSize := loc.Size % uint64(loc.BlobSize); lastSize > 0 {
blobs[len(blobs)-1].Size = uint32(lastSize)
}
}
return blobs
}
// DecodeLocation parse location from buf
// Returns Location and the number of bytes read
// Error is not nil when parsing failed
func DecodeLocation(buf []byte) (Location, int, error) {
var (
loc Location
n int
val uint64
nn int
)
next := func() (uint64, int) {
val, nn := binary.Uvarint(buf)
if nn <= 0 {
return 0, nn
}
n += nn
buf = buf[nn:]
return val, nn
}
if len(buf) < 4 {
return loc, n, fmt.Errorf("bytes crc %d", len(buf))
}
loc.Crc = binary.BigEndian.Uint32(buf)
n += 4
buf = buf[4:]
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes cluster_id %d", nn)
}
loc.ClusterID = proto.ClusterID(val)
if len(buf) < 1 {
return loc, n, fmt.Errorf("bytes codemode %d", len(buf))
}
loc.CodeMode = codemode.CodeMode(buf[0])
n++
buf = buf[1:]
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes size %d", nn)
}
loc.Size = val
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes blob_size %d", nn)
}
loc.BlobSize = uint32(val)
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes length blobs %d", nn)
}
length := int(val)
if length > 0 {
loc.Blobs = make([]SliceInfo, 0, length)
}
for index := 0; index < length; index++ {
var blob SliceInfo
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes %dth-blob min_bid %d", index, nn)
}
blob.MinBid = proto.BlobID(val)
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes %dth-blob vid %d", index, nn)
}
blob.Vid = proto.Vid(val)
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes %dth-blob count %d", index, nn)
}
blob.Count = uint32(val)
loc.Blobs = append(loc.Blobs, blob)
}
return loc, n, nil
}
// DecodeLocationFrom decode location from hex string
func DecodeLocationFrom(s string) (Location, error) {
return DecodeLocationFromHex(s)
}
// DecodeLocationFromHex decode location from hex string
func DecodeLocationFromHex(s string) (Location, error) {
var loc Location
src, err := hex.DecodeString(s)
if err != nil {
return loc, err
}
_, err = loc.Decode(src)
if err != nil {
return loc, err
}
return loc, nil
}
// DecodeLocationFromBase64 decode location from base64 string
func DecodeLocationFromBase64(s string) (Location, error) {
var loc Location
src, err := base64.StdEncoding.DecodeString(s)
if err != nil {
return loc, err
}
_, err = loc.Decode(src)
if err != nil {
return loc, err
}
return loc, nil
}
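// A round-trip sketch over the helpers above (loc is any valid Location):
//
//	buf := loc.Encode()                   // compact binary form
//	s := loc.HexString()                  // or loc.Base64String()
//	loc2, err := DecodeLocationFromHex(s) // loc2 should equal loc field by field when err is nil
//	loc3, n, err := DecodeLocation(buf)   // n is the number of bytes consumed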
// PutArgs for service /put
// Hashes says how to calculate the checksums,
// e.g. HashAlgCRC32 | HashAlgMD5 equals 2 + 4 = 6
type PutArgs struct {
Size int64 `json:"size"`
Hashes HashAlgorithm `json:"hashes,omitempty"`
Body io.Reader `json:"-"`
}
// IsValid reports whether the put args are valid
func (args *PutArgs) IsValid() bool {
if args == nil {
return false
}
return args.Size > 0
}
// PutResp put response result
type PutResp struct {
Location Location `json:"location"`
HashSumMap HashSumMap `json:"hashsum"`
}
// PutAtArgs for service /putat
type PutAtArgs struct {
ClusterID proto.ClusterID `json:"clusterid"`
Vid proto.Vid `json:"volumeid"`
BlobID proto.BlobID `json:"blobid"`
Size int64 `json:"size"`
Hashes HashAlgorithm `json:"hashes,omitempty"`
Token string `json:"token"`
Body io.Reader `json:"-"`
}
// IsValid reports whether the putat args are valid
func (args *PutAtArgs) IsValid() bool {
if args == nil {
return false
}
return args.ClusterID > proto.ClusterID(0) &&
args.Vid > proto.Vid(0) &&
args.BlobID > proto.BlobID(0) &&
args.Size > 0
}
// PutAtResp putat response result
type PutAtResp struct {
HashSumMap HashSumMap `json:"hashsum"`
}
// AllocArgs for service /alloc
type AllocArgs struct {
Size uint64 `json:"size"`
BlobSize uint32 `json:"blob_size"`
AssignClusterID proto.ClusterID `json:"assign_cluster_id"`
CodeMode codemode.CodeMode `json:"code_mode"`
}
// IsValid reports whether the alloc args are valid
func (args *AllocArgs) IsValid() bool {
if args == nil {
return false
}
if args.AssignClusterID > 0 {
return args.Size > 0 && args.BlobSize > 0 && args.BlobSize <= MaxBlobSize &&
args.CodeMode.IsValid()
}
return args.Size > 0 && args.BlobSize <= MaxBlobSize
}
// AllocResp alloc response result with tokens
// if size mod blobsize == 0, the number of tokens equals the number of location blobs,
// otherwise there is one additional token for uploading the last blob
type AllocResp struct {
Location Location `json:"location"`
Tokens []string `json:"tokens"`
}
// GetArgs for service /get
type GetArgs struct {
Location Location `json:"location"`
Offset uint64 `json:"offset"`
ReadSize uint64 `json:"read_size"`
}
// IsValid reports whether the get args are valid
func (args *GetArgs) IsValid() bool {
if args == nil {
return false
}
return args.Offset <= args.Location.Size &&
args.ReadSize <= args.Location.Size &&
args.Offset+args.ReadSize <= args.Location.Size
}
// DeleteArgs for service /delete
type DeleteArgs struct {
Locations []Location `json:"locations"`
}
// IsValid reports whether the delete args are valid
func (args *DeleteArgs) IsValid() bool {
if args == nil {
return false
}
return len(args.Locations) > 0 && len(args.Locations) <= MaxDeleteLocations
}
// DeleteResp delete response with failed locations
type DeleteResp struct {
FailedLocations []Location `json:"failed_locations,omitempty"`
}
// DeleteBlobArgs for service /deleteblob
type DeleteBlobArgs struct {
ClusterID proto.ClusterID `json:"clusterid"`
Vid proto.Vid `json:"volumeid"`
BlobID proto.BlobID `json:"blobid"`
Size int64 `json:"size"`
Token string `json:"token"`
}
// IsValid reports whether the delete blob args are valid
func (args *DeleteBlobArgs) IsValid() bool {
if args == nil {
return false
}
return args.ClusterID > proto.ClusterID(0) &&
args.Vid > proto.Vid(0) &&
args.BlobID > proto.BlobID(0) &&
args.Size > 0
}
// SignArgs for service /sign
// Locations are the signed locations obtained from /alloc
// Location is the merged location to be signed by the caller
type SignArgs struct {
Locations []Location `json:"locations"`
Location Location `json:"location"`
}
// IsValid reports whether the sign args are valid
func (args *SignArgs) IsValid() bool {
if args == nil {
return false
}
return len(args.Locations) > 0
}
// SignResp sign response location with crc
type SignResp struct {
Location Location `json:"location"`
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package codemode
import "fmt"
type (
// CodeMode EC encode and decode mode
CodeMode uint8
CodeModeName string
)
// pre-defined mode
const (
EC15P12 CodeMode = 1
EC6P6 CodeMode = 2
EC16P20L2 CodeMode = 3
EC6P10L2 CodeMode = 4
EC6P3L3 CodeMode = 5
EC6P6Align0 CodeMode = 6
EC6P6Align512 CodeMode = 7
EC4P4L2 CodeMode = 8
EC12P4 CodeMode = 9
EC16P4 CodeMode = 10
EC3P3 CodeMode = 11
EC10P4 CodeMode = 12
EC6P3 CodeMode = 13
EC12P9 CodeMode = 14
// for test
EC6P6L9 CodeMode = 200
EC6P8L10 CodeMode = 201
)
// Note: Don't modify it unless you know very well how codemode works.
const (
// align size per shard
alignSize0B = 0 // 0B
alignSize512B = 512 // 512B
alignSize2KB = 2048 // 2KB
)
// Each tactic is fixed and paired with one codemode.
// Add a new codemode if you want other features.
var constCodeModeTactic = map[CodeMode]Tactic{
// three az
EC15P12: {N: 15, M: 12, L: 0, AZCount: 3, PutQuorum: 24, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P6: {N: 6, M: 6, L: 0, AZCount: 3, PutQuorum: 11, GetQuorum: 0, MinShardSize: alignSize2KB},
EC12P9: {N: 12, M: 9, L: 0, AZCount: 3, PutQuorum: 20, GetQuorum: 0, MinShardSize: alignSize2KB},
// two az
EC16P20L2: {N: 16, M: 20, L: 2, AZCount: 2, PutQuorum: 34, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P10L2: {N: 6, M: 10, L: 2, AZCount: 2, PutQuorum: 14, GetQuorum: 0, MinShardSize: alignSize2KB},
// single az
EC12P4: {N: 12, M: 4, L: 0, AZCount: 1, PutQuorum: 15, GetQuorum: 0, MinShardSize: alignSize2KB},
EC16P4: {N: 16, M: 4, L: 0, AZCount: 1, PutQuorum: 19, GetQuorum: 0, MinShardSize: alignSize2KB},
EC3P3: {N: 3, M: 3, L: 0, AZCount: 1, PutQuorum: 5, GetQuorum: 0, MinShardSize: alignSize2KB},
EC10P4: {N: 10, M: 4, L: 0, AZCount: 1, PutQuorum: 13, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P3: {N: 6, M: 3, L: 0, AZCount: 1, PutQuorum: 8, GetQuorum: 0, MinShardSize: alignSize2KB},
// for env test
EC6P3L3: {N: 6, M: 3, L: 3, AZCount: 3, PutQuorum: 9, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P6Align0: {N: 6, M: 6, L: 0, AZCount: 3, PutQuorum: 11, GetQuorum: 0, MinShardSize: alignSize0B},
EC6P6Align512: {N: 6, M: 6, L: 0, AZCount: 3, PutQuorum: 11, GetQuorum: 0, MinShardSize: alignSize512B},
EC4P4L2: {N: 4, M: 4, L: 2, AZCount: 2, PutQuorum: 6, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P6L9: {N: 6, M: 6, L: 9, AZCount: 3, PutQuorum: 11, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P8L10: {N: 6, M: 8, L: 10, AZCount: 2, PutQuorum: 13, GetQuorum: 0, MinShardSize: alignSize0B},
}
var constName2CodeMode = map[CodeModeName]CodeMode{
"EC15P12": EC15P12,
"EC6P6": EC6P6,
"EC16P20L2": EC16P20L2,
"EC6P10L2": EC6P10L2,
"EC6P3L3": EC6P3L3,
"EC6P6Align0": EC6P6Align0,
"EC6P6Align512": EC6P6Align512,
"EC4P4L2": EC4P4L2,
"EC12P4": EC12P4,
"EC16P4": EC16P4,
"EC3P3": EC3P3,
"EC10P4": EC10P4,
"EC6P3": EC6P3,
"EC6P6L9": EC6P6L9,
"EC6P8L10": EC6P8L10,
"EC12P9": EC12P9,
}
var constCodeMode2Name = map[CodeMode]CodeModeName{
EC15P12: "EC15P12",
EC6P6: "EC6P6",
EC16P20L2: "EC16P20L2",
EC6P10L2: "EC6P10L2",
EC6P3L3: "EC6P3L3",
EC6P6Align0: "EC6P6Align0",
EC6P6Align512: "EC6P6Align512",
EC4P4L2: "EC4P4L2",
EC12P4: "EC12P4",
EC16P4: "EC16P4",
EC3P3: "EC3P3",
EC10P4: "EC10P4",
EC6P3: "EC6P3",
EC6P6L9: "EC6P6L9",
EC6P8L10: "EC6P8L10",
EC12P9: "EC12P9",
}
// vol layout, e.g. EC6P10L2
// |----N------|--------M----------------|--L--|
// |0,1,2,3,4,5|6,7,8,9,10,11,12,13,14,15|16,17|
// global stripe:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], n=6 m=10
// two local stripes:
// local stripe1:[0,1,2, 6, 7, 8, 9,10, 16] n=8 m=1
// local stripe2:[3,4,5, 11,12,13,14,15, 17] n=8 m=1
// Tactic constant strategy of one CodeMode
type Tactic struct {
N int
M int
// local parity count
L int
// the count of AZs; access uses this to split data shards and parity shards
AZCount int
// PutQuorum write quorum,
// MUST make sure that the ec data is recoverable if one AZ goes down
// Local shards SHOULD be ignored here
// (N + M) / AZCount + N <= PutQuorum <= M + N
PutQuorum int
// get quorum config
GetQuorum int
// MinShardSize min size per shard; data is filled into shards 0-N continuously,
// padded with zero bytes if the data size is less than MinShardSize*N
//
// if the length of data is less than MinShardSize*N, the size of each shard = MinShardSize
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | data | align zero bytes |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | 0 | 1 | 2 | .... | N |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
//
// if the length of data is more than MinShardSize*N, the size of each shard = ceil(len(data)/N)
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | data |padding|
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | 0 | 1 | 2 | .... | N |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MinShardSize int
}
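// A worked instance of the PutQuorum bound above, using the EC6P6 tactic from
// constCodeModeTactic (N=6, M=6, AZCount=3):
//
//	(N + M) / AZCount + N = (6+6)/3 + 6 = 10 <= PutQuorum(11) <= N + M = 12
//
// which is exactly the range asserted in init below.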
func init() {
// assert all codemode
for _, pair := range []struct {
Mode CodeMode
Size int
}{
{Mode: EC15P12, Size: alignSize2KB},
{Mode: EC6P6, Size: alignSize2KB},
{Mode: EC12P9, Size: alignSize2KB},
{Mode: EC16P20L2, Size: alignSize2KB},
{Mode: EC6P10L2, Size: alignSize2KB},
{Mode: EC6P3L3, Size: alignSize2KB},
{Mode: EC6P6Align0, Size: alignSize0B},
{Mode: EC6P6Align512, Size: alignSize512B},
} {
tactic := pair.Mode.Tactic()
if !tactic.IsValid() {
panic(fmt.Sprintf("Invalid codemode:%d Tactic:%+v", pair.Mode, tactic))
}
min := tactic.N + (tactic.N+tactic.M)/tactic.AZCount
max := tactic.N + tactic.M
if tactic.PutQuorum < min || tactic.PutQuorum > max {
panic(fmt.Sprintf("Invalid codemode:%d PutQuorum:%d([%d,%d])", pair.Mode,
tactic.PutQuorum, min, max))
}
if tactic.MinShardSize != pair.Size {
panic(fmt.Sprintf("Invalid codemode:%d MinShardSize:%d(%d)", pair.Mode,
tactic.MinShardSize, pair.Size))
}
}
}
// T returns a pointer to a copy of the Tactic, used like:
// EC6P6.T().AllLocalStripe()
func (c CodeMode) T() *Tactic {
tactic := c.Tactic()
return &tactic
}
// Tactic returns its constant tactic
func (c CodeMode) Tactic() Tactic {
if tactic, ok := constCodeModeTactic[c]; ok {
return tactic
}
panic(fmt.Sprintf("Invalid codemode:%d", c))
}
// GetShardNum returns the total number of shards.
func (c CodeMode) GetShardNum() int {
tactic := c.Tactic()
return tactic.L + tactic.M + tactic.N
}
// Name turns the CodeMode into its CodeModeName
func (c CodeMode) Name() CodeModeName {
if name, ok := constCodeMode2Name[c]; ok {
return name
}
panic(fmt.Sprintf("codemode: %d is invalid", c))
}
// String turns the CodeMode into a string
func (c CodeMode) String() string {
if name, ok := constCodeMode2Name[c]; ok {
return string(name)
}
return ""
}
// IsValid checks whether the CodeMode is valid
func (c CodeMode) IsValid() bool {
if _, ok := constCodeMode2Name[c]; ok {
return ok
}
return false
}
// GetCodeMode gets the code mode by name
func (cn CodeModeName) GetCodeMode() CodeMode {
if code, ok := constName2CodeMode[cn]; ok {
return code
}
panic(fmt.Sprintf("codemode: %s is invalid", cn))
}
// IsValid checks whether the CodeModeName is valid
func (cn CodeModeName) IsValid() bool {
if _, ok := constName2CodeMode[cn]; ok {
return ok
}
return false
}
// Tactic gets the tactic by code mode name
func (cn CodeModeName) Tactic() Tactic {
return cn.GetCodeMode().Tactic()
}
// IsValid reports whether the ec tactic is valid
func (c *Tactic) IsValid() bool {
return c.N > 0 && c.M > 0 && c.L >= 0 && c.AZCount > 0 &&
c.PutQuorum > 0 && c.GetQuorum >= 0 && c.MinShardSize >= 0 &&
c.N%c.AZCount == 0 && c.M%c.AZCount == 0 && c.L%c.AZCount == 0
}
// GetECLayoutByAZ ec layout by AZ
func (c *Tactic) GetECLayoutByAZ() (azStripes [][]int) {
azStripes = make([][]int, c.AZCount)
n, m, l := c.N/c.AZCount, c.M/c.AZCount, c.L/c.AZCount
for idx := range azStripes {
stripe := make([]int, 0, n+m+l)
for i := 0; i < n; i++ {
stripe = append(stripe, idx*n+i)
}
for i := 0; i < m; i++ {
stripe = append(stripe, c.N+idx*m+i)
}
for i := 0; i < l; i++ {
stripe = append(stripe, c.N+c.M+idx*l+i)
}
azStripes[idx] = stripe
}
return azStripes
}
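// For example, EC6P10L2 (N=6, M=10, L=2, AZCount=2) produces the two local
// stripes drawn in the layout comment above:
//
//	EC6P10L2.T().GetECLayoutByAZ()
//	// => [[0 1 2 6 7 8 9 10 16] [3 4 5 11 12 13 14 15 17]]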
// GlobalStripe returns the initial global stripe indexes with n and m
func (c *Tactic) GlobalStripe() (indexes []int, n, m int) {
indexes = make([]int, c.N+c.M)
for i := 0; i < c.N+c.M; i++ {
indexes[i] = i
}
return indexes, c.N, c.M
}
// AllLocalStripe returns all local stripes
func (c *Tactic) AllLocalStripe() (stripes [][]int, n, m int) {
if c.L == 0 {
return
}
n, m, l := c.N/c.AZCount, c.M/c.AZCount, c.L/c.AZCount
return c.GetECLayoutByAZ(), n + m, l
}
// LocalStripe gets the local stripe containing the shard at index
func (c *Tactic) LocalStripe(index int) (localStripe []int, n, m int) {
if c.L == 0 {
return nil, 0, 0
}
n, m, l := c.N/c.AZCount, c.M/c.AZCount, c.L/c.AZCount
var azIdx int
if index < c.N {
azIdx = index / n
} else if index < c.N+c.M {
azIdx = (index - c.N) / m
} else if index < c.N+c.M+c.L {
azIdx = (index - c.N - c.M) / l
} else {
return nil, 0, 0
}
return c.LocalStripeInAZ(azIdx)
}
// LocalStripeInAZ gets the local stripe of the az at azIndex
func (c *Tactic) LocalStripeInAZ(azIndex int) (localStripe []int, n, m int) {
if c.L == 0 {
return nil, 0, 0
}
n, m, l := c.N/c.AZCount, c.M/c.AZCount, c.L/c.AZCount
azStripes := c.GetECLayoutByAZ()
if azIndex < 0 || azIndex >= len(azStripes) {
return nil, 0, 0
}
return azStripes[azIndex][:], n + m, l
}
// GetAllCodeModes gets all the available CodeModes
func GetAllCodeModes() []CodeMode {
return []CodeMode{
EC15P12,
EC6P6,
EC16P20L2,
EC6P10L2,
EC6P3L3,
EC6P6Align0,
EC6P6Align512,
EC4P4L2,
EC12P4,
EC16P4,
EC3P3,
EC10P4,
EC6P3,
EC6P6L9,
EC6P8L10,
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"encoding/binary"
"hash/crc32"
)
const (
defaultCrc32BlockSize = 64 * 1024
)
var gBlockSize int64 = defaultCrc32BlockSize
type blockUnit []byte
func (b blockUnit) length() int {
return len(b)
}
func (b blockUnit) payload() int {
return len(b) - crc32Len
}
func (b blockUnit) check() (err error) {
payloadCrc := crc32.ChecksumIEEE(b[crc32Len:])
if binary.LittleEndian.Uint32(b) != payloadCrc {
return ErrMismatchedCrc
}
return nil
}
func (b blockUnit) writeCrc() {
payloadCrc := crc32.ChecksumIEEE(b[crc32Len:])
binary.LittleEndian.PutUint32(b, payloadCrc)
}
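// An illustrative use of the block helpers above (payload is a placeholder
// byte slice; a block is crc(4) followed by the payload):
//
//	b := make(blockUnit, crc32Len+len(payload))
//	copy(b[crc32Len:], payload)
//	b.writeCrc()     // stamps the little-endian IEEE crc32 of the payload
//	err := b.check() // nil as long as the block is untouched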
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"bufio"
"io"
"io/ioutil"
)
/*
journal record:
|-----block-----|------block-----|---block---|
block:
|--crc--|------payload -----|
*/
type Decoder struct {
from io.ReaderAt // read from
off int64 // offset. readonly
limit int64 // size limit, readonly
bufSize int64 // for speed
block blockUnit // block buffer
}
type decoderReader struct {
reader io.Reader //
block []byte //
i, j int // block[i:j]
err error //
}
type rangeReader struct {
r io.Reader
limit int64
skip int64
skiped bool
}
type blockReader struct {
reader io.Reader //
block blockUnit //
i, j int // block[i:j] is unread portion of the current block's payload.
remain int64 //
err error //
}
func (br *blockReader) Read(p []byte) (n int, err error) {
if br.err != nil {
return 0, br.err
}
for br.i == br.j {
if br.remain == 0 {
return n, io.EOF
}
br.err = br.nextBlock()
if br.err != nil {
return 0, br.err
}
}
n = copy(p, br.block[br.i:br.j])
br.i += n
br.remain -= int64(n)
return n, nil
}
func (br *blockReader) nextBlock() (err error) {
blockLen := int64(len(br.block))
blockPayloadLen := int64(blockLen - crc32Len)
want := blockLen
if br.remain < blockPayloadLen {
want = br.remain + crc32Len
}
_, err = io.ReadFull(br.reader, br.block[:want])
if err != nil {
br.err = err
return br.err
}
if err = blockUnit(br.block[:want]).check(); err != nil {
return err
}
br.i = crc32Len
br.j = int(want)
return nil
}
func (r *rangeReader) Read(p []byte) (n int, err error) {
if !r.skiped {
_, err := io.CopyN(ioutil.Discard, r.r, r.skip)
if err != nil {
return 0, err
}
r.skiped = true
r.r = io.LimitReader(r.r, r.limit)
}
return r.r.Read(p)
}
func (dec *Decoder) Reader(from, to int64) (r io.Reader, err error) {
blockLen := int64(dec.block.length())
blockPayloadLen := int64(dec.block.payload())
blockOff := (from / blockPayloadLen) * blockLen
encodedSize := EncodeSize(dec.limit, blockLen) - blockOff
// raw reader
r = io.NewSectionReader(dec.from, dec.off+blockOff, encodedSize)
// buffer
r = bufio.NewReaderSize(r, int(dec.bufSize))
// decode reader
r = NewBlockReader(r, DecodeSize(encodedSize, blockLen), dec.block)
// range reader
r = &rangeReader{
r: r,
limit: to - from,
skip: from % blockPayloadLen,
}
return r, nil
}
func (r *decoderReader) Read(b []byte) (n int, err error) {
if r.err != nil {
return 0, r.err
}
for len(b) > 0 {
if r.i == r.j {
if r.err = r.nextBlock(); r.err != nil {
if n > 0 {
return n, nil
}
return n, r.err
}
}
readn := copy(b, r.block[r.i:r.j])
r.i += readn
b = b[readn:]
n += readn
}
return
}
func (r *decoderReader) nextBlock() (err error) {
n, err := readFullOrToEnd(r.reader, r.block)
if err != nil {
return
}
if n <= crc32Len {
return ErrMismatchedCrc
}
if err = blockUnit(r.block[:n]).check(); err != nil {
return ErrMismatchedCrc
}
r.i, r.j = crc32Len, n
return nil
}
func NewBlockReader(r io.Reader, limit int64, block []byte) *blockReader {
if block == nil || !isValidBlockLen(int64(len(block))) {
panic(ErrInvalidBlock)
}
return &blockReader{reader: r, remain: limit, block: block}
}
// NewDecoderReader returns io.Reader
//
// Deprecated: no buffer reuse; use NewBodyDecoder instead.
func NewDecoderReader(in io.Reader) io.Reader {
chunk := make([]byte, defaultCrc32BlockSize)
return &decoderReader{block: chunk, err: nil, reader: in}
}
func NewDecoderWithBlock(r io.ReaderAt, off int64, size int64, block []byte, bufferSize int64) (dec *Decoder, err error) {
if block == nil || !isValidBlockLen(int64(len(block))) {
return nil, ErrInvalidBlock
}
return &Decoder{from: r, off: off, block: block, limit: size, bufSize: bufferSize}, nil
}
func NewDecoder(r io.ReaderAt, off int64, size int64) (dec *Decoder, err error) {
block := make([]byte, defaultCrc32BlockSize)
return NewDecoderWithBlock(r, off, size, block, defaultCrc32BlockSize)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"io"
)
type ReaderError struct {
error
}
type WriterError struct {
error
}
type Encoder struct {
block blockUnit // block buffer
}
type limitEncoderReader struct {
reader io.Reader
block blockUnit
remain int64
i, j int
err error
}
type encoderReader struct {
reader io.Reader //
block blockUnit //
i, j int // block[i:j]
err error //
}
func (enc *Encoder) Encode(from io.Reader, limitSize int64, to io.Writer) (n int64, err error) {
if !isValidBlockLen(int64(enc.block.length())) {
panic(ErrInvalidBlock)
}
encSize := EncodeSize(limitSize, int64(enc.block.length()))
reader := &limitEncoderReader{reader: from, block: enc.block, remain: limitSize}
return io.CopyN(to, reader, encSize)
}
func (r *limitEncoderReader) Read(b []byte) (n int, err error) {
if r.err != nil {
return 0, r.err
}
for len(b) > 0 {
if r.i == r.j {
if r.remain == 0 {
return n, io.EOF
}
if r.err = r.nextBlock(); r.err != nil {
if n > 0 {
return n, nil
}
return n, r.err
}
}
readn := copy(b, r.block[r.i:r.j])
r.i += readn
b = b[readn:]
n += readn
}
return
}
func (r *limitEncoderReader) nextBlock() (err error) {
blockPayloadLen := r.block.payload()
needn := blockPayloadLen
if r.remain < int64(blockPayloadLen) {
needn = int(r.remain)
}
block := blockUnit(r.block[:crc32Len+needn])
n, err := io.ReadFull(r.reader, block[crc32Len:])
if err != nil {
return ReaderError{err}
}
r.i = 0
r.j = crc32Len + n
blockUnit(r.block[r.i:r.j]).writeCrc()
r.remain -= int64(block.payload())
return nil
}
func (r *encoderReader) Read(b []byte) (n int, err error) {
if r.err != nil {
return 0, r.err
}
for len(b) > 0 {
if r.i == r.j {
if r.err = r.nextBlock(); r.err != nil {
if n > 0 {
return n, nil
}
return n, r.err
}
}
readn := copy(b, r.block[r.i:r.j])
r.i += readn
b = b[readn:]
n += readn
}
return
}
func (r *encoderReader) nextBlock() (err error) {
n, err := readFullOrToEnd(r.reader, r.block[crc32Len:])
if err != nil {
return err
}
r.i = 0
r.j = crc32Len + n
blockUnit(r.block[r.i:r.j]).writeCrc()
return nil
}
func NewEncoder(block []byte) (enc *Encoder, err error) {
if block != nil && !isValidBlockLen(int64(len(block))) {
return nil, ErrInvalidBlock
}
if block == nil {
block = make([]byte, defaultCrc32BlockSize)
}
return &Encoder{block: block}, nil
}
// NewEncoderReader returns io.Reader
//
// Deprecated: no buffer reuse; use NewBodyEncoder instead.
func NewEncoderReader(r io.Reader) io.Reader {
block := make([]byte, defaultCrc32BlockSize)
return &encoderReader{block: block, reader: r}
}
func NewLimitEncoderReader(r io.Reader, limitSize int64) (enc *limitEncoderReader) {
block := make([]byte, defaultCrc32BlockSize)
enc = &limitEncoderReader{reader: r, block: block, remain: limitSize}
return
}
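// A round-trip sketch with the Encoder here and the Decoder defined earlier in
// this package (data is a placeholder byte slice, error handling elided):
//
//	enc, _ := NewEncoder(nil) // default 64KB block
//	var sealed bytes.Buffer
//	_, err := enc.Encode(bytes.NewReader(data), int64(len(data)), &sealed)
//
//	dec, _ := NewDecoder(bytes.NewReader(sealed.Bytes()), 0, int64(len(data)))
//	r, _ := dec.Reader(0, int64(len(data))) // any sub-range [from, to) works as well
//	out, _ := ioutil.ReadAll(r)             // out should equal data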
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"io"
"sync"
"github.com/cubefs/cubefs/blobstore/util/bytespool"
)
// RequestBody implements an http request's body.
// It is always an io.ReadCloser.
//
// For client requests, the HTTP Client's Transport is
// responsible for calling the Close method. Be sure to call
// the Close method yourself if you control the body's life-cycle.
//
// For server requests, the Server will close the request body.
// The ServeHTTP Handler does not need to.
//
// The Body must allow Read to be called concurrently with Close.
// In particular, calling Close should unblock a Read waiting
// for input.
type RequestBody interface {
io.ReadCloser
// CodeSize returns the encoded whole-body size when encoding,
// or the original body size when decoding.
CodeSize(int64) int64
}
type requestBody struct {
encode bool
offset int
err error
block blockUnit
rc io.ReadCloser
blockLock chan struct{} // safely free the block
closeCh chan struct{}
closeOnce sync.Once
}
func (r *requestBody) Read(p []byte) (n int, err error) {
if r.err != nil {
return 0, r.err
}
for len(p) > 0 {
if r.offset < 0 || r.offset == r.block.length() {
if r.err = r.nextBlock(); r.err != nil {
if n > 0 {
return n, nil
}
return n, r.err
}
}
read := copy(p, r.block[r.offset:])
r.offset += read
p = p[read:]
n += read
}
return n, nil
}
func (r *requestBody) nextBlock() error {
var (
n int
err error
block blockUnit
)
if r.encode {
block = r.block[crc32Len:]
} else {
block = r.block
}
readCh := make(chan struct{})
go func() {
if _, ok := <-r.blockLock; !ok {
// closed
return
}
n, err = readFullOrToEnd(r.rc, block)
close(readCh)
r.blockLock <- struct{}{}
}()
select {
case <-r.closeCh:
return ErrReadOnClosed
case <-readCh:
}
if err != nil {
return err
}
if r.encode {
r.offset = 0
r.block = r.block[:crc32Len+n]
r.block.writeCrc()
return nil
}
if n <= crc32Len {
return ErrMismatchedCrc
}
r.offset = crc32Len
r.block = r.block[:n]
if err = r.block.check(); err != nil {
return ErrMismatchedCrc
}
return nil
}
func (r *requestBody) Close() error {
r.closeOnce.Do(func() {
block := r.block
r.block = nil
close(r.closeCh)
go func(buf []byte) {
<-r.blockLock
close(r.blockLock)
bytespool.Free(buf)
}(block)
})
return r.rc.Close()
}
func (r *requestBody) CodeSize(size int64) int64 {
if r.encode {
return EncodeSize(size, int64(r.block.length()))
}
return DecodeSize(size, int64(r.block.length()))
}
type codeSizeBody struct {
encode bool
blockLength int64
}
func (c *codeSizeBody) Read(p []byte) (n int, err error) { return 0, io.EOF }
func (c *codeSizeBody) Close() error { return nil }
func (c *codeSizeBody) CodeSize(size int64) int64 {
if c.encode {
return EncodeSize(size, c.blockLength)
}
return DecodeSize(size, c.blockLength)
}
// TODO: use resourcepool's chan-pool if the block size is greater than 64K.
func newRequestBody(rc io.ReadCloser, encode bool) RequestBody {
if rc == nil {
return &codeSizeBody{
encode: encode,
blockLength: gBlockSize,
}
}
lock := make(chan struct{}, 1)
lock <- struct{}{}
return &requestBody{
encode: encode,
block: bytespool.Alloc(int(gBlockSize)),
offset: -1,
rc: rc,
blockLock: lock,
closeCh: make(chan struct{}),
}
}
// NewBodyEncoder returns an encoder with crc32.
//
// If rc == nil, the encoder is only usable for CodeSize,
// and you need not Close it at all.
func NewBodyEncoder(rc io.ReadCloser) RequestBody {
return newRequestBody(rc, true)
}
// NewBodyDecoder returns a decoder with crc32.
//
// If rc == nil, the decoder is only usable for CodeSize,
// and you need not Close it at all.
func NewBodyDecoder(rc io.ReadCloser) RequestBody {
return newRequestBody(rc, false)
}
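// Example (illustrative sketch of pairing the body encoder with an HTTP
// request; data and url are placeholders for a []byte payload and a string):
//
//	body := NewBodyEncoder(io.NopCloser(bytes.NewReader(data)))
//	defer body.Close()
//	req, err := http.NewRequest(http.MethodPut, url, body)
//	if err != nil {
//		// handle err
//	}
//	req.ContentLength = body.CodeSize(int64(len(data)))
//	// the receiving side wraps req.Body with NewBodyDecoder to verify the crc32 blocks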
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"errors"
"io"
)
const (
crc32Len = 4
baseBlockBit = 12
baseBlockLen = (1 << baseBlockBit)
)
var (
ErrInvalidBlock = errors.New("crc32block: invalid block buffer")
ErrMismatchedCrc = errors.New("crc32block: mismatched checksum")
ErrReadOnClosed = errors.New("crc32block: read on closed")
)
func isValidBlockLen(blockLen int64) bool {
return blockLen > 0 && blockLen%baseBlockLen == 0
}
func blockPayload(blockLen int64) int64 {
return blockLen - crc32Len
}
// SetBlockSize sets the default block size
func SetBlockSize(blockSize int64) {
if !isValidBlockLen(blockSize) {
panic(ErrInvalidBlock)
}
gBlockSize = blockSize
}
func EncodeSize(size int64, blockLen int64) int64 {
if !isValidBlockLen(blockLen) {
panic(ErrInvalidBlock)
}
payload := blockPayload(blockLen)
blockCnt := (size + (payload - 1)) / payload
return size + 4*blockCnt
}
func DecodeSize(totalSize int64, blockLen int64) int64 {
if !isValidBlockLen(blockLen) {
panic(ErrInvalidBlock)
}
blockCnt := (totalSize + (blockLen - 1)) / blockLen
return totalSize - 4*blockCnt
}
func EncodeSizeWithDefualtBlock(size int64) int64 {
return EncodeSize(size, defaultCrc32BlockSize)
}
func DecodeSizeWithDefualtBlock(size int64) int64 {
return DecodeSize(size, defaultCrc32BlockSize)
}
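// Worked example of the size math above, with an explicit blockLen of 4096
// (payload = 4096 - crc32Len = 4092):
//
//	EncodeSize(10000, 4096) // ceil(10000/4092) = 3 blocks -> 10000 + 3*4 = 10012
//	DecodeSize(10012, 4096) // ceil(10012/4096) = 3 blocks -> 10012 - 3*4 = 10000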
func readFullOrToEnd(r io.Reader, buffer []byte) (n int, err error) {
nn, size := 0, len(buffer)
for n < size && err == nil {
nn, err = r.Read(buffer[n:])
n += nn
if n != 0 && err == io.EOF {
return n, nil
}
}
return n, err
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"errors"
"net/http"
"github.com/cubefs/cubefs/blobstore/common/rpc"
)
var (
// 2xx
ErrExist = newError(http.StatusCreated, "Data Already Exist")
// 4xx
ErrIllegalArguments = newError(http.StatusBadRequest, "Illegal Arguments")
ErrNotFound = newError(http.StatusNotFound, "Not Found")
ErrRequestTimeout = newError(http.StatusRequestTimeout, "Request Timeout")
ErrRequestedRangeNotSatisfiable = newError(http.StatusRequestedRangeNotSatisfiable, "Request Range Not Satisfiable")
ErrRequestNotAllow = newError(http.StatusBadRequest, "Request Not Allow")
ErrReaderError = newError(499, "Reader Error")
// 5xx ErrUnexpected - unexpected error, requires manual intervention.
ErrUnexpected = newError(http.StatusInternalServerError, "Unexpected Error")
)
func newError(status int, msg string) *rpc.Error {
return rpc.NewError(status, "", errors.New(msg))
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"net/http"
"github.com/cubefs/cubefs/blobstore/common/rpc"
)
// access 550-599
// blobnode 600-699
// scheduler 700-799
// proxy 800-899
// clusterMgr 900-999
// Error is the http status code for all applications
type Error int
var _ rpc.HTTPError = Error(0)
// Error implements error and rpc.HTTPError
func (e Error) Error() string {
return errCodeMap[int(e)]
}
// StatusCode implements rpc.HTTPError
func (e Error) StatusCode() int {
return int(e)
}
// ErrorCode implements rpc.HTTPError
func (e Error) ErrorCode() string {
return ""
}
var errCodeMap = map[int]string{
// access
CodeAccessReadRequestBody: "access read request body",
CodeAccessUnexpect: "access unexpected error",
CodeAccessServiceDiscovery: "access client service discovery disconnect",
CodeAccessLimited: "access limited",
CodeAccessExceedSize: "access exceed object size",
// clustermgr
CodeCMUnexpect: "cm: unexpected error",
CodeLockNotAllow: "lock volume not allow",
CodeUnlockNotAllow: "unlock volume not allow",
CodeVolumeNotExist: "volume not exist",
CodeRaftPropose: "raft propose error",
CodeNoLeader: "no leader",
CodeRaftReadIndex: "raft read index error",
CodeDuplicatedMemberInfo: "duplicated member info",
CodeCMDiskNotFound: "disk not found",
CodeInvalidDiskStatus: "invalid status",
CodeChangeDiskStatusNotAllow: "not allow to change status back",
CodeConcurrentAllocVolumeUnit: "alloc volume unit concurrently",
CodeNoAvailableVolume: "no available volume",
CodeAllocVolumeInvalidParams: "alloc volume request params is invalid",
CodeOldVuidNotMatch: "update volume unit, old vuid not match",
CodeNewVuidNotMatch: "update volume unit, new vuid not match",
CodeNewDiskIDNotMatch: "update volume unit, new diskID not match",
CodeConfigArgument: "config argument marshal error",
CodeInvalidClusterID: "request params error, invalid clusterID",
CodeInvalidIDC: "request params error,invalid idc",
CodeVolumeUnitNotExist: "volume unit not exist",
CodeDiskAbnormalOrNotReadOnly: "disk is abnormal or not readonly, can't add into dropping list",
CodeStatChunkFailed: "stat blob node chunk failed",
CodeInvalidCodeMode: "request alloc volume codeMode not invalid",
CodeRetainVolumeNotAlloc: "retain volume is not alloc",
CodeDroppedDiskHasVolumeUnit: "dropped disk still has volume unit remain, migrate them firstly",
CodeNotSupportIdle: "list volume v2 not support idle status",
CodeDiskIsDropping: "dropping disk not allow change state or set readonly",
CodeRejectDeleteSystemConfig: "reject delete system config",
CodeRegisterServiceInvalidParams: "register service params is invalid",
// scheduler
CodeNotingTodo: "nothing to do",
// proxy
CodeNoAvaliableVolume: "this codemode has no avaliable volume",
CodeAllocBidFromCm: "alloc bid from clustermgr error",
CodeClusterIDNotMatch: "clusterId not match",
// blobnode
CodeInvalidParam: "blobnode: invalid params",
CodeAlreadyExist: "blobnode: entry already exist",
CodeOutOfLimit: "blobnode: out of limit",
CodeInternal: "blobnode: internal error",
CodeOverload: "blobnode: service is overload",
CodePathNotExist: "blobnode: path is not exist",
CodePathNotEmpty: "blobnode: path is not empty",
CodePathFindOnline: "blobnode: path find online disk",
CodeDiskNotFound: "disk not found",
CodeDiskBroken: "disk is broken",
CodeInvalidDiskId: "disk id is invalid",
CodeDiskNoSpace: "disk no space",
CodeVuidNotFound: "vuid not found",
CodeVUIDReadonly: "vuid readonly",
CodeVUIDRelease: "vuid released",
CodeVuidNotMatch: "vuid not match",
CodeChunkNotReadonly: "chunk must readonly",
CodeChunkNotNormal: "chunk must normal",
CodeChunkNoSpace: "chunk no space",
CodeChunkCompacting: "chunk is compacting",
CodeInvalidChunkId: "chunk id is invalid",
CodeTooManyChunks: "too many chunks",
CodeChunkInuse: "chunk in use",
CodeSizeOverBurst: "request size over limit burst",
CodeBidNotFound: "bid not found",
CodeShardSizeTooLarge: "shard size too large",
CodeShardNotMarkDelete: "shard must mark delete",
CodeShardMarkDeleted: "shard already mark delete",
CodeShardInvalidOffset: "shard offset is invalid",
CodeShardInvalidBid: "shard key bid is invalid",
CodeShardListExceedLimit: "shard list exceed the limit",
CodeDestReplicaBad: "dest replica is bad can not repair",
CodeOrphanShard: "shard is an orphan",
CodeIllegalTask: "illegal task",
CodeRequestLimited: "request limited",
}
// HTTPError make rpc.HTTPError
func HTTPError(statusCode int, errCode string, err error) error {
return rpc.NewError(statusCode, errCode, err)
}
// Error2HTTPError transfer error to rpc.HTTPError
func Error2HTTPError(err error) error {
if err == nil {
return nil
}
if e, ok := err.(rpc.HTTPError); ok {
return e
}
if code, ok := err.(Error); ok {
return code
}
return rpc.NewError(http.StatusInternalServerError, "ServerError", err)
}
// DetectCode detect code
func DetectCode(err error) int {
if err == nil {
return http.StatusOK
}
if code, ok := err.(Error); ok {
return int(code)
}
if httpErr, ok := err.(rpc.HTTPError); ok {
return httpErr.StatusCode()
}
return http.StatusInternalServerError
}
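// Example (illustrative sketch; 618 is a made-up status code, any value of
// type Error behaves the same way):
//
//	var err error = Error(618)
//	DetectCode(err)                 // 618
//	DetectCode(nil)                 // http.StatusOK
//	DetectCode(io.ErrUnexpectedEOF) // http.StatusInternalServerError
//	Error2HTTPError(err)            // returns err itself, it already implements rpc.HTTPError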
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"encoding/binary"
"fmt"
"strconv"
"strings"
)
// basic types for all modules
type (
DiskID uint32
BlobID uint64
Vid uint32
ClusterID uint32
)
func (id DiskID) Encode() []byte {
key := make([]byte, 4)
binary.BigEndian.PutUint32(key, uint32(id))
return key
}
func (id *DiskID) Decode(b []byte) DiskID {
key := binary.BigEndian.Uint32(b)
*id = DiskID(key)
return *id
}
func (id DiskID) ToString() string {
return strconv.FormatUint(uint64(id), 10)
}
func (vid Vid) ToString() string {
return strconv.FormatUint(uint64(vid), 10)
}
func (id ClusterID) ToString() string {
return strconv.FormatUint(uint64(id), 10)
}
const seqToken = ";"
// EncodeToken encode host and vid to a string token.
func EncodeToken(host string, vid Vid) (token string) {
return fmt.Sprintf("%s%s%s", host, seqToken, strconv.FormatUint(uint64(vid), 10))
}
// DecodeToken decode host and vid from the token.
func DecodeToken(token string) (host string, vid Vid, err error) {
parts := strings.SplitN(token, seqToken, 2)
if len(parts) != 2 {
err = fmt.Errorf("invalid token %s", token)
return
}
host = parts[0]
vidU32, err := strconv.ParseUint(parts[1], 10, 32)
if err != nil {
return
}
vid = Vid(vidU32)
return
}
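// Example (round trip of the token helpers; the host value is a placeholder):
//
//	token := EncodeToken("127.0.0.1:9500", Vid(12)) // "127.0.0.1:9500;12"
//	host, vid, err := DecodeToken(token)            // host == "127.0.0.1:9500", vid == Vid(12), err == nil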
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"math"
)
// service names
const (
ServiceNameBlobNode = "BLOBNODE"
ServiceNameProxy = "PROXY"
ServiceNameScheduler = "SCHEDULER"
)
type DiskStatus uint8
// disk status
const (
DiskStatusNormal = DiskStatus(iota + 1) // 1
DiskStatusBroken // 2
DiskStatusRepairing // 3
DiskStatusRepaired // 4
DiskStatusDropped // 5
DiskStatusMax // 6
)
func (status DiskStatus) IsValid() bool {
return status >= DiskStatusNormal && status < DiskStatusMax
}
func (status DiskStatus) String() string {
switch status {
case DiskStatusNormal:
return "normal"
case DiskStatusBroken:
return "broken"
case DiskStatusRepairing:
return "repairing"
case DiskStatusRepaired:
return "repaired"
case DiskStatusDropped:
return "dropped"
default:
return "unknown"
}
}
const (
InvalidDiskID = DiskID(0)
InValidBlobID = BlobID(0)
InvalidCrc32 = uint32(0)
InvalidVid = Vid(0)
InvalidVuid = Vuid(0)
)
const (
MaxBlobID = BlobID(math.MaxUint64)
)
// volume status
type VolumeStatus uint8
func (status VolumeStatus) IsValid() bool {
return status > volumeStatusMin && status < volumeStatusMax
}
func (status VolumeStatus) String() string {
switch status {
case VolumeStatusIdle:
return "idle"
case VolumeStatusActive:
return "active"
case VolumeStatusLock:
return "lock"
case VolumeStatusUnlocking:
return "unlocking"
default:
return "unknown"
}
}
const (
volumeStatusMin = VolumeStatus(iota)
VolumeStatusIdle
VolumeStatusActive
VolumeStatusLock
VolumeStatusUnlocking
volumeStatusMax
)
// system config keys, deletion is not allowed
const (
CodeModeConfigKey = "code_mode"
VolumeReserveSizeKey = "volume_reserve_size"
VolumeChunkSizeKey = "volume_chunk_size"
)
func IsSysConfigKey(key string) bool {
switch key {
case VolumeChunkSizeKey, VolumeReserveSizeKey, CodeModeConfigKey:
return true
default:
return false
}
}
type TaskSwitch string
const (
TaskSwitchDataInspect TaskSwitch = "data_inspect"
)
func (t TaskSwitch) Valid() bool {
switch t {
case TaskSwitchDataInspect:
return true
default:
return false
}
}
func (t TaskSwitch) String() string {
return string(t)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"github.com/cubefs/cubefs/blobstore/util/errors"
)
var ErrInvalidMsg = errors.New("msg is invalid")
type DeleteStage byte
const (
InitStage DeleteStage = iota
DeleteStageMarkDelete
DeleteStageDelete
)
type BlobDeleteStage struct {
Stages map[uint8]DeleteStage `json:"stages"`
}
func (s *BlobDeleteStage) SetStage(vuidIdx uint8, stage DeleteStage) {
if s.Stages == nil {
s.Stages = make(map[uint8]DeleteStage)
}
s.Stages[vuidIdx] = stage
}
func (s *BlobDeleteStage) Stage(vuid Vuid) (DeleteStage, bool) {
stage, exist := s.Stages[vuid.Index()]
return stage, exist
}
func (s *BlobDeleteStage) Copy() BlobDeleteStage {
myCopy := BlobDeleteStage{}
myCopy.Stages = make(map[uint8]DeleteStage)
for k, v := range s.Stages {
myCopy.Stages[k] = v
}
return myCopy
}
type DeleteMsg struct {
ClusterID ClusterID `json:"cluster_id"`
Bid BlobID `json:"bid"`
Vid Vid `json:"vid"`
Retry int `json:"retry"`
Time int64 `json:"time"`
ReqId string `json:"req_id"`
BlobDelStages BlobDeleteStage `json:"blob_del_stages"`
}
func (msg *DeleteMsg) IsValid() bool {
if msg.Bid == InValidBlobID {
return false
}
if msg.Vid == InvalidVid {
return false
}
return true
}
func (msg *DeleteMsg) SetDeleteStage(stage BlobDeleteStage) {
for idx, s := range stage.Stages {
msg.BlobDelStages.SetStage(idx, s)
}
}
type ShardRepairMsg struct {
ClusterID ClusterID `json:"cluster_id"`
Bid BlobID `json:"bid"`
Vid Vid `json:"vid"`
BadIdx []uint8 `json:"bad_idx"`
Retry int `json:"retry"`
Reason string `json:"reason"`
ReqId string `json:"req_id"`
}
func (msg *ShardRepairMsg) IsValid() bool {
if msg.Bid == InValidBlobID {
return false
}
if msg.Vid == InvalidVid {
return false
}
if len(msg.BadIdx) == 0 {
return false
}
return true
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"sync"
"github.com/cubefs/cubefs/blobstore/common/codemode"
"github.com/cubefs/cubefs/blobstore/util/errors"
)
var (
ErrTaskPaused = errors.New("task has paused")
ErrTaskEmpty = errors.New("no task to run")
)
const (
// TaskRenewalPeriodS + RenewalTimeoutS < TaskLeaseExpiredS
TaskRenewalPeriodS = 5 // worker alive tasks renewal period
RenewalTimeoutS = 1 // timeout of worker task renewal
TaskLeaseExpiredS = 10 // task lease duration in scheduler
)
type TaskType string
const (
TaskTypeDiskRepair TaskType = "disk_repair"
TaskTypeBalance TaskType = "balance"
TaskTypeDiskDrop TaskType = "disk_drop"
TaskTypeManualMigrate TaskType = "manual_migrate"
TaskTypeVolumeInspect TaskType = "volume_inspect"
TaskTypeShardRepair TaskType = "shard_repair"
TaskTypeBlobDelete TaskType = "blob_delete"
)
func (t TaskType) Valid() bool {
switch t {
case TaskTypeDiskRepair, TaskTypeBalance, TaskTypeDiskDrop, TaskTypeManualMigrate,
TaskTypeVolumeInspect, TaskTypeShardRepair, TaskTypeBlobDelete:
return true
default:
return false
}
}
func (t TaskType) String() string {
return string(t)
}
type VunitLocation struct {
Vuid Vuid `json:"vuid" bson:"vuid"`
Host string `json:"host" bson:"host"`
DiskID DiskID `json:"disk_id" bson:"disk_id"`
}
// CheckVunitLocations checks that the volume unit locations of a task are complete.
func CheckVunitLocations(locations []VunitLocation) bool {
if len(locations) == 0 {
return false
}
for _, l := range locations {
if l.Vuid == InvalidVuid || l.Host == "" || l.DiskID == InvalidDiskID {
return false
}
}
return true
}
type MigrateState uint8
const (
MigrateStateInited MigrateState = iota + 1
MigrateStatePrepared
MigrateStateWorkCompleted
MigrateStateFinished
MigrateStateFinishedInAdvance
)
type MigrateTask struct {
TaskID string `json:"task_id"` // task id
TaskType TaskType `json:"task_type"` // task type
State MigrateState `json:"state"` // task state
SourceIDC string `json:"source_idc"` // source idc
SourceDiskID DiskID `json:"source_disk_id"` // source disk id
SourceVuid Vuid `json:"source_vuid"` // source volume unit id
Sources []VunitLocation `json:"sources"` // source volume units location
CodeMode codemode.CodeMode `json:"code_mode"` // codemode
Destination VunitLocation `json:"destination"` // destination volume unit location
Ctime string `json:"ctime"` // create time
MTime string `json:"mtime"` // modify time
FinishAdvanceReason string `json:"finish_advance_reason"`
// the migrate task tries direct chunk download first; if that fails, it recovers the chunk by EC repair
ForbiddenDirectDownload bool `json:"forbidden_direct_download"`
WorkerRedoCnt uint8 `json:"worker_redo_cnt"` // worker redo task count
}
func (t *MigrateTask) Vid() Vid {
return t.SourceVuid.Vid()
}
func (t *MigrateTask) GetSources() []VunitLocation {
return t.Sources
}
func (t *MigrateTask) GetDestination() VunitLocation {
return t.Destination
}
func (t *MigrateTask) SetDestination(dest VunitLocation) {
t.Destination = dest
}
func (t *MigrateTask) DestinationDiskID() DiskID {
return t.Destination.DiskID
}
func (t *MigrateTask) GetSourceDiskID() DiskID {
return t.SourceDiskID
}
func (t *MigrateTask) Running() bool {
return t.State == MigrateStatePrepared || t.State == MigrateStateWorkCompleted
}
func (t *MigrateTask) Copy() *MigrateTask {
task := &MigrateTask{}
*task = *t
dst := make([]VunitLocation, len(t.Sources))
copy(dst, t.Sources)
task.Sources = dst
return task
}
func (t *MigrateTask) IsValid() bool {
return t.TaskType.Valid() && t.CodeMode.IsValid() &&
CheckVunitLocations(t.Sources) &&
CheckVunitLocations([]VunitLocation{t.Destination})
}
type VolumeInspectCheckPoint struct {
StartVid Vid `json:"start_vid"` // min vid in current batch volumes
Ctime string `json:"ctime"`
}
type VolumeInspectTask struct {
TaskID string `json:"task_id"`
Mode codemode.CodeMode `json:"mode"`
Replicas []VunitLocation `json:"replicas"`
}
func (t *VolumeInspectTask) IsValid() bool {
return t.Mode.IsValid() && CheckVunitLocations(t.Replicas)
}
type MissedShard struct {
Vuid Vuid `json:"vuid"`
Bid BlobID `json:"bid"`
}
type VolumeInspectRet struct {
TaskID string `json:"task_id"`
InspectErrStr string `json:"inspect_err_str"` // empty when the inspect run succeeded
MissedShards []*MissedShard `json:"missed_shards"`
}
func (inspect *VolumeInspectRet) Err() error {
if len(inspect.InspectErrStr) == 0 {
return nil
}
return errors.New(inspect.InspectErrStr)
}
type ShardRepairTask struct {
Bid BlobID `json:"bid"`
CodeMode codemode.CodeMode `json:"code_mode"`
Sources []VunitLocation `json:"sources"`
BadIdxs []uint8 `json:"bad_idxs"` // TODO: BadIdxes
Reason string `json:"reason"`
}
func (task *ShardRepairTask) IsValid() bool {
return task.CodeMode.IsValid() && CheckVunitLocations(task.Sources)
}
// TaskStatistics thread-unsafe task statistics.
type TaskStatistics struct {
DoneSize uint64 `json:"done_size"`
DoneCount uint64 `json:"done_count"`
TotalSize uint64 `json:"total_size"`
TotalCount uint64 `json:"total_count"`
Progress uint64 `json:"progress"`
}
// TaskProgress migrate task running progress.
type TaskProgress interface {
Total(size, count uint64) // reset total size and count.
Do(size, count uint64) // update progress.
Done() TaskStatistics // returns newest statistics.
}
// NewTaskProgress returns thread-safe task progress.
func NewTaskProgress() TaskProgress {
return &taskProgress{}
}
type taskProgress struct {
mu sync.Mutex
st TaskStatistics
}
func (p *taskProgress) Total(size, count uint64) {
p.mu.Lock()
st := &p.st
st.TotalSize = size
st.TotalCount = count
if st.TotalSize == 0 {
st.Progress = 100
} else {
st.Progress = (st.DoneSize * 100) / st.TotalSize
}
p.mu.Unlock()
}
func (p *taskProgress) Do(size, count uint64) {
p.mu.Lock()
st := &p.st
st.DoneSize += size
st.DoneCount += count
if st.TotalSize == 0 {
st.Progress = 100
} else {
st.Progress = (st.DoneSize * 100) / st.TotalSize
}
p.mu.Unlock()
}
func (p *taskProgress) Done() TaskStatistics {
p.mu.Lock()
st := p.st
p.mu.Unlock()
return st
}
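// Example (illustrative sketch of TaskProgress; the sizes are arbitrary):
//
//	progress := NewTaskProgress()
//	progress.Total(1<<20, 4) // 1MiB in 4 shards
//	progress.Do(1<<19, 2)    // half of the bytes done
//	st := progress.Done()    // st.Progress == 50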
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"errors"
"strconv"
)
type (
Vuid uint64
VuidPrefix uint64
)
const (
MinEpoch = 1
MaxEpoch = 16777215
MinIndex = 0
MaxIndex = 255
)
func (vu Vuid) IsValid() bool {
return vu > InvalidVuid && IsValidEpoch(vu.Epoch()) && IsValidIndex(vu.Index())
}
func NewVuid(vid Vid, idx uint8, epoch uint32) (Vuid, error) {
if !IsValidEpoch(epoch) {
err := errors.New("fail to new vuid,Epoch is overflow")
return 0, err
}
u64 := uint64(vid)<<32 + uint64(idx)<<24 + uint64(epoch)
return Vuid(u64), nil
}
func EncodeVuidPrefix(vid Vid, idx uint8) VuidPrefix {
u64 := uint64(vid)<<32 + uint64(idx)<<24
return VuidPrefix(u64)
}
func EncodeVuid(v VuidPrefix, epoch uint32) Vuid {
u64 := uint64(v) + uint64(epoch)
return Vuid(u64)
}
func (v Vuid) Vid() Vid {
return Vid(v & 0xffffffff00000000 >> 32)
}
func (v Vuid) ToString() string {
return strconv.FormatUint(uint64(v), 10)
}
func (v Vuid) Index() uint8 {
return uint8(v & 0xff000000 >> 24)
}
func (v Vuid) Epoch() uint32 {
return uint32(v & 0xffffff)
}
func (v Vuid) VuidPrefix() VuidPrefix {
vuidPre := uint64(v) - uint64(v.Epoch())
return VuidPrefix(vuidPre)
}
func (v VuidPrefix) Vid() Vid {
return Vid(v & 0xffffffff00000000 >> 32)
}
func (v VuidPrefix) Index() uint8 {
return uint8(v & 0xff000000 >> 24)
}
func IsValidEpoch(epoch uint32) bool {
return epoch <= MaxEpoch && epoch >= MinEpoch
}
func IsValidIndex(index uint8) bool {
return index <= MaxIndex && index >= MinIndex
}
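// Example (vuid bit layout: vid in the high 32 bits, index in the next 8 bits,
// epoch in the low 24 bits; the values are arbitrary):
//
//	vuid, _ := NewVuid(Vid(7), 2, 1)
//	// vuid.Vid() == Vid(7), vuid.Index() == 2, vuid.Epoch() == 1
//	// EncodeVuid(vuid.VuidPrefix(), vuid.Epoch()) == vuid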
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package resourcepool
import (
"runtime"
"sync"
"sync/atomic"
"time"
)
// Buffers cached in the chan pool will not be released by runtime.GC().
// The pool's chan length changes dynamically by EMA (Exponential Moving Average);
// redundant buffers are dropped from the chan and then released by GC.
const maxMemorySize = 1 << 32 // 4G
// A SliceHeader is 24 bytes, and a buffered channel allocates
// its memory in one call; see `makechan` at:
// https://github.com/golang/go/blob/master/src/runtime/chan.go
//
// Limit max channel memory to 96MB (24 * (1<<22)),
// which could be reduced to 32MB by using type *[]byte.
const maxChanSize = 1 << 22 // 4m
var releaseInterval int64 = int64(time.Minute) * 2
// SetReleaseInterval sets the release interval duration
func SetReleaseInterval(duration time.Duration) {
if duration > time.Millisecond*100 {
atomic.StoreInt64(&releaseInterval, int64(duration))
}
}
type chPool struct {
chBuffer chan []byte
newBuffer func() []byte
capacity int
concurrence int32
closeCh chan struct{}
closeOnce sync.Once
}
// NewChanPool returns a Pool with the given capacity; there is no limit if capacity is negative
func NewChanPool(newFunc func() []byte, capacity int) Pool {
chCap := capacity
if chCap < 0 {
buf := newFunc()
chCap = maxMemorySize / len(buf)
}
if chCap > maxChanSize {
chCap = maxChanSize
}
pool := &chPool{
chBuffer: make(chan []byte, chCap),
newBuffer: newFunc,
capacity: capacity,
closeCh: make(chan struct{}),
}
runtime.SetFinalizer(pool, func(p *chPool) {
p.closeOnce.Do(func() {
close(p.closeCh)
})
})
go pool.loopRelease()
return pool
}
// loopRelease releases redundant buffers in the chan.
// It samples the EMA of concurrence several times per release interval
// and releases the redundant buffers once per release interval.
//
// It reserves a 30% redundancy over capacity:
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | length of buffer chan | concurrence |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | redundant | capacity | reserved |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | to release | buffers keep in memory |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
func (p *chPool) loopRelease() {
const emaRound = 5
ticker := time.NewTicker(time.Duration(atomic.LoadInt64(&releaseInterval)) / emaRound)
defer ticker.Stop()
var (
turn int
capacity int32
)
for {
select {
case <-p.closeCh:
for {
select {
case <-p.chBuffer:
default:
return
}
}
case <-ticker.C:
nowConc := atomic.LoadInt32(&p.concurrence)
capacity = ema(nowConc, capacity)
if turn = (turn + 1) % emaRound; turn != 0 {
continue
}
capa := capacity * 13 / 10
redundant := len(p.chBuffer) + int(nowConc-capa)
if redundant <= 0 {
continue
}
has := true
for ii := 0; has && ii < redundant; ii++ {
select {
case <-p.chBuffer:
default:
has = false
}
}
}
}
}
func (p *chPool) Get() (interface{}, error) {
atomic.AddInt32(&p.concurrence, 1)
select {
case buf := <-p.chBuffer:
return buf, nil
default:
return p.newBuffer(), nil
}
}
func (p *chPool) Put(x interface{}) {
buf, ok := x.([]byte)
if !ok {
return
}
select {
case p.chBuffer <- buf:
default:
}
atomic.AddInt32(&p.concurrence, -1)
}
func (p *chPool) Cap() int {
return p.capacity
}
func (p *chPool) Len() int {
return int(atomic.LoadInt32(&p.concurrence))
}
func (p *chPool) Idle() int {
return len(p.chBuffer)
}
func ema(val, lastVal int32) int32 {
return (val*2 + lastVal*8) / 10
}
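// Example (illustrative sketch of the chan pool; the buffer size and
// capacity are arbitrary):
//
//	pool := NewChanPool(func() []byte { return make([]byte, 1<<14) }, 1024)
//	x, _ := pool.Get() // chPool.Get never returns an error
//	buf := x.([]byte)
//	// ... use buf ...
//	pool.Put(buf)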
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package resourcepool
import (
"errors"
"sort"
)
// ErrNoSuitableSizeClass no suitable pool of size
var ErrNoSuitableSizeClass = errors.New("no suitable size class")
// zeroing bytes performs best with a 16KB chunk, see the benchmark:
// BenchmarkZero/4MB-16KB-4 13338 88378 ns/op
// BenchmarkZero/8MB-16KB-4 6670 183987 ns/op
// BenchmarkZero/16MB-16KB-4 1926 590422 ns/op
const zeroLen = 1 << 14
var zero = make([]byte, zeroLen)
// MemPool reused buffer pool
type MemPool struct {
pool []Pool
poolSize []int
}
// Status memory pools status
type Status []PoolStatus
// PoolStatus status of the pool
type PoolStatus struct {
Size int `json:"size"`
Capacity int `json:"capacity"`
Running int `json:"running"`
Idle int `json:"idle"`
}
// NewMemPool returns a MemPool backed by chan pools
func NewMemPool(sizeClasses map[int]int) *MemPool {
return NewMemPoolWith(sizeClasses, func(size, capacity int) Pool {
return NewChanPool(func() []byte {
return make([]byte, size)
}, capacity)
})
}
// NewMemPoolWith returns a MemPool with the given size classes and a self-defined pool constructor
func NewMemPoolWith(sizeClasses map[int]int, newPool func(size, capacity int) Pool) *MemPool {
pool := make([]Pool, 0, len(sizeClasses))
poolSize := make([]int, 0, len(sizeClasses))
for sizeClass := range sizeClasses {
if sizeClass > 0 {
poolSize = append(poolSize, sizeClass)
}
}
sort.Ints(poolSize)
for _, sizeClass := range poolSize {
pool = append(pool, newPool(sizeClass, sizeClasses[sizeClass]))
}
return &MemPool{
pool: pool,
poolSize: poolSize,
}
}
// Get returns a suitable buffer
func (p *MemPool) Get(size int) ([]byte, error) {
for idx, ps := range p.poolSize {
if size <= ps {
buf, err := p.pool[idx].Get()
if err != nil {
return nil, err
}
buff := buf.([]byte)
return buff[:size], nil
}
}
return nil, ErrNoSuitableSizeClass
}
// Alloc returns a buffer, making a new one if the size exceeds all size classes
func (p *MemPool) Alloc(size int) ([]byte, error) {
buf, err := p.Get(size)
if err == ErrNoSuitableSizeClass {
return make([]byte, size), nil
}
return buf, err
}
// Put adds b back to the pool, resized appropriately to its size class
func (p *MemPool) Put(b []byte) error {
sizeClass := cap(b)
b = b[0:sizeClass]
for ii := len(p.poolSize) - 1; ii >= 0; ii-- {
if sizeClass >= p.poolSize[ii] {
b = b[0:p.poolSize[ii]]
p.pool[ii].Put(b)
return nil
}
}
return ErrNoSuitableSizeClass
}
// Zero fills the buffer b with zero bytes
func (p *MemPool) Zero(b []byte) {
Zero(b)
}
// Status returns status of memory pool
func (p *MemPool) Status() Status {
st := make(Status, len(p.poolSize))
for idx, size := range p.poolSize {
pool := p.pool[idx]
st[idx] = PoolStatus{
Size: size,
Capacity: pool.Cap(),
Running: pool.Len(),
Idle: pool.Idle(),
}
}
return st
}
// Zero fills the buffer b with zero bytes
func Zero(b []byte) {
for len(b) > 0 {
n := copy(b, zero)
b = b[n:]
}
}
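// Example (illustrative sketch of MemPool; the size classes are arbitrary):
//
//	mp := NewMemPool(map[int]int{1 << 12: 1024, 1 << 16: 128})
//	buf, err := mp.Alloc(5000) // served by the 1<<16 class, len(buf) == 5000
//	if err != nil {
//		// handle err
//	}
//	// ... use buf ...
//	mp.Zero(buf)
//	_ = mp.Put(buf) // cap(buf) decides which class it returns to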
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package resourcepool
// sync.Pool cache will be released by runtime.GC()
// see sync/pool.go: runtime_registerPoolCleanup(poolCleanup)
import (
"errors"
"sync"
"sync/atomic"
)
// ErrPoolLimit pool elements exceed its capacity
var ErrPoolLimit = errors.New("resource pool limit")
// Pool is a resource pool built on sync.Pool with a capacity limit.
// Resources are released when no longer used;
// there is no limit if capacity is negative.
type Pool interface {
// Get returns nil and an error if the pool's capacity is exceeded
Get() (interface{}, error)
Put(x interface{})
Cap() int
Len() int
// Idle returns the number of cached idle objects in the pool.
Idle() int
}
// the sync.Pool-based pool's Idle returns -1 if there is no limit
type pool struct {
sp sync.Pool
capacity int32
current int32
}
// NewPool returns a Pool with the given capacity; there is no limit if capacity is negative
func NewPool(newFunc func() interface{}, capacity int) Pool {
return &pool{
sp: sync.Pool{New: newFunc},
capacity: int32(capacity),
current: int32(0),
}
}
func (p *pool) Get() (interface{}, error) {
current := atomic.AddInt32(&p.current, 1)
if p.capacity >= 0 && current > p.capacity {
atomic.AddInt32(&p.current, -1)
return nil, ErrPoolLimit
}
return p.sp.Get(), nil
}
func (p *pool) Put(x interface{}) {
p.sp.Put(x)
atomic.AddInt32(&p.current, -1)
}
func (p *pool) Cap() int {
return int(p.capacity)
}
func (p *pool) Len() int {
return int(atomic.LoadInt32(&p.current))
}
func (p *pool) Idle() int {
if p.capacity < 0 {
return -1
}
return p.Cap() - p.Len()
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"encoding/base64"
"encoding/json"
"fmt"
"io"
"reflect"
"strconv"
"strings"
"github.com/cubefs/cubefs/blobstore/util/bytespool"
"github.com/cubefs/cubefs/blobstore/util/log"
)
type (
parserKey struct {
PkgPath string
Name string
FieldName string
}
parserVal struct {
Name string
Opt struct {
Ignore bool // "-"
Omitempty bool // ",omitempty"
Base64 bool // ",base64"
}
}
)
var registeredParsers map[parserKey]parserVal
func init() {
registeredParsers = make(map[parserKey]parserVal)
}
// RegisterArgsParser registers the argument struct whose fields need to be
// parsed from uri, query, form, or postform.
// The tags are matched in the given order.
// NOTE: the function is thread-unsafe.
func RegisterArgsParser(args interface{}, tags ...string) {
if args == nil {
return
}
if _, ok := args.(Parser); ok {
return
}
typ := reflect.TypeOf(args)
val := reflect.ValueOf(args)
if typ.Kind() != reflect.Ptr {
log.Panicf("args(%s) must be pointer", typ.Name())
}
typ = typ.Elem()
if typ.Kind() != reflect.Struct {
log.Panicf("args(%s) reference must be struct", typ.Name())
}
val = val.Elem()
t := val.Type()
for i := 0; i < val.NumField(); i++ {
ft := t.Field(i)
pVal := parserVal{
Name: strings.ToLower(ft.Name),
}
for _, tag := range tags {
tagStr := ft.Tag.Get(tag)
if tagStr != "" {
ts := strings.Split(tagStr, ",")
if ts[0] == "-" {
pVal.Opt.Ignore = true
break
}
if ts[0] != "" {
pVal.Name = ts[0]
}
for _, t := range ts[1:] {
switch t {
case "omitempty":
pVal.Opt.Omitempty = true
case "base64":
pVal.Opt.Base64 = true
default:
}
}
break
}
}
pKey := parserKey{
PkgPath: typ.PkgPath(),
Name: typ.Name(),
FieldName: ft.Name,
}
registeredParsers[pKey] = pVal
log.Infof("register args field:%+v val:%+v", pKey, pVal)
}
}
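// Example (illustrative sketch; listArgs is a hypothetical argument struct,
// registered here with its json tags):
//
//	type listArgs struct {
//		Vid    uint32 `json:"vid"`
//		Marker string `json:"marker,omitempty"`
//	}
//
//	func init() {
//		RegisterArgsParser(&listArgs{}, "json")
//	}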
func parseArgs(c *Context, args interface{}, opts ...ServerOption) error {
if args == nil {
return nil
}
opt := c.opts
if len(opts) > 0 {
opt = c.opts.copy()
for _, o := range opts {
o.apply(opt)
}
}
if opt.argsBody {
size, err := c.RequestLength()
if err != nil {
return err
}
if arg, ok := args.(UnmarshalerFrom); ok {
return arg.UnmarshalFrom(io.LimitReader(c.Request.Body, int64(size)))
}
buf := bytespool.Alloc(size)
defer bytespool.Free(buf)
if _, err = io.ReadFull(c.Request.Body, buf); err != nil {
return err
}
if arg, ok := args.(Unmarshaler); ok {
return arg.Unmarshal(buf)
}
return json.Unmarshal(buf, args)
}
if !opt.hasArgs() {
return nil
}
getter := func(fKey string) string {
if opt.argsURI {
if val := c.Param.ByName(fKey); val != "" {
return val
}
}
if opt.argsQuery {
if val := c.Request.URL.Query().Get(fKey); val != "" {
return val
}
}
if opt.argsForm {
if val := c.Request.Form.Get(fKey); val != "" {
return val
}
}
if opt.argsPostForm {
if val := c.Request.PostForm.Get(fKey); val != "" {
return val
}
}
return ""
}
if arg, ok := args.(Parser); ok {
return arg.Parse(getter)
}
typ := reflect.TypeOf(args)
val := reflect.ValueOf(args)
if typ.Kind() != reflect.Ptr {
return fmt.Errorf("args(%s) must be pointer", typ.Name())
}
typ = typ.Elem()
if typ.Kind() != reflect.Struct {
return fmt.Errorf("args(%s) reference must be struct", typ.Name())
}
val = val.Elem()
t := val.Type()
for i := 0; i < val.NumField(); i++ {
ft := t.Field(i)
pVal, ok := registeredParsers[parserKey{
PkgPath: typ.PkgPath(),
Name: typ.Name(),
FieldName: ft.Name,
}]
if !ok {
pVal = parserVal{Name: strings.ToLower(ft.Name)}
}
if pVal.Opt.Ignore {
continue
}
fVal := getter(pVal.Name)
if fVal == "" {
if pVal.Opt.Omitempty {
continue
}
return fmt.Errorf("args(%s) field(%s) do not omit", typ.Name(), ft.Name)
}
if pVal.Opt.Base64 {
switch len(fVal) & 3 {
case 2:
fVal += "=="
case 3:
fVal += "="
default:
}
b, err := base64.URLEncoding.DecodeString(fVal)
if err != nil {
return fmt.Errorf("args(%s) field(%s) invalid base64(%s)", typ.Name(), ft.Name, fVal)
}
fVal = string(b)
}
fv := val.Field(i)
if err := parseValue(fv, fVal); err != nil {
return err
}
}
return nil
}
func parseValue(val reflect.Value, str string) (err error) {
var (
bv bool
iv int64
uv uint64
fv float64
)
RETRY:
switch val.Kind() {
case reflect.Bool:
bv, err = strconv.ParseBool(str)
val.SetBool(bv)
case reflect.Int:
iv, err = strconv.ParseInt(str, 10, 0)
val.SetInt(iv)
case reflect.Int8:
iv, err = strconv.ParseInt(str, 10, 8)
val.SetInt(iv)
case reflect.Int16:
iv, err = strconv.ParseInt(str, 10, 16)
val.SetInt(iv)
case reflect.Int32:
iv, err = strconv.ParseInt(str, 10, 32)
val.SetInt(iv)
case reflect.Int64:
iv, err = strconv.ParseInt(str, 10, 64)
val.SetInt(iv)
case reflect.Uint:
uv, err = strconv.ParseUint(str, 10, 0)
val.SetUint(uv)
case reflect.Uint8:
uv, err = strconv.ParseUint(str, 10, 8)
val.SetUint(uv)
case reflect.Uint16:
uv, err = strconv.ParseUint(str, 10, 16)
val.SetUint(uv)
case reflect.Uint32:
uv, err = strconv.ParseUint(str, 10, 32)
val.SetUint(uv)
case reflect.Uint64:
uv, err = strconv.ParseUint(str, 10, 64)
val.SetUint(uv)
case reflect.Float32:
fv, err = strconv.ParseFloat(str, 32)
val.SetFloat(fv)
case reflect.Float64:
fv, err = strconv.ParseFloat(str, 64)
val.SetFloat(fv)
case reflect.String:
val.SetString(str)
case reflect.Uintptr:
uv, err = strconv.ParseUint(str, 10, 64)
val.SetUint(uv)
case reflect.Ptr:
elem := reflect.New(val.Type().Elem())
val.Set(elem)
val = elem.Elem()
goto RETRY
case reflect.Slice:
if val.Type().Elem().Kind() == reflect.Uint8 {
val.SetBytes([]byte(str))
} else {
return fmt.Errorf("unsupported type(%s) of slice", val.Type().Elem().Kind().String())
}
default:
return fmt.Errorf("unsupported type(%s)", val.Kind().String())
}
return
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auth
import (
"bytes"
"crypto/md5"
"encoding/base64"
"encoding/binary"
"errors"
"net/http"
)
const (
// an md5 digest is 16 bytes
TokenKeyLenth = 16
// #nosec G101
TokenHeaderKey = "BLOB-STORE-AUTH-TOKEN"
)
var errMismatchToken = errors.New("mismatch token")
type Config struct {
EnableAuth bool `json:"enable_auth"`
Secret string `json:"secret"`
}
// simply use the timestamp as a parameter of the token calculation
type authInfo struct {
timestamp int64
token []byte
// other auth content
others []byte
}
func encodeAuthInfo(info *authInfo) (ret string, err error) {
w := bytes.NewBuffer([]byte{})
if err = binary.Write(w, binary.LittleEndian, &info.timestamp); err != nil {
return
}
if err = binary.Write(w, binary.LittleEndian, &info.token); err != nil {
return
}
return base64.URLEncoding.EncodeToString(w.Bytes()), nil
}
func decodeAuthInfo(encodeStr string) (info *authInfo, err error) {
info = new(authInfo)
b, err := base64.URLEncoding.DecodeString(encodeStr)
if err != nil {
return
}
info.token = make([]byte, TokenKeyLenth)
r := bytes.NewBuffer(b)
if err = binary.Read(r, binary.LittleEndian, &info.timestamp); err != nil {
return
}
if err = binary.Read(r, binary.LittleEndian, &info.token); err != nil {
return
}
return
}
// calculate auth token with params and secret
func calculate(info *authInfo, secret []byte) (err error) {
hash := md5.New()
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, uint64(info.timestamp))
hash.Write(info.others)
hash.Write(b)
hash.Write(secret)
info.token = hash.Sum(nil)
return
}
// verify auth token with params and secret
func verify(info *authInfo, secret []byte) (err error) {
checkAuthInfo := &authInfo{timestamp: info.timestamp, others: info.others}
calculate(checkAuthInfo, secret)
if !bytes.Equal(checkAuthInfo.token, info.token) {
return errMismatchToken
}
return
}
func genEncodeStr(req *http.Request) []byte {
calStr := req.URL.Path + req.URL.RawQuery
return []byte(calStr)
}
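// Example (in-package sketch of the token round trip; secret and req are
// placeholders for the shared secret and the incoming *http.Request):
//
//	info := &authInfo{timestamp: time.Now().Unix(), others: genEncodeStr(req)}
//	_ = calculate(info, secret) // fills info.token
//	err := verify(info, secret) // nil when the token matches, errMismatchToken otherwise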
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auth
import (
"net/http"
)
type AuthHandler struct {
Secret []byte
}
func NewAuthHandler(cfg *Config) *AuthHandler {
if cfg.EnableAuth {
if cfg.Secret == "" {
panic("auth secret can not be nil")
}
return &AuthHandler{
Secret: []byte(cfg.Secret),
}
}
return nil
}
func (self *AuthHandler) Handler(w http.ResponseWriter, req *http.Request, f func(http.ResponseWriter, *http.Request)) {
token := req.Header.Get(TokenHeaderKey)
if token == "" {
w.WriteHeader(http.StatusForbidden)
return
}
info, err := decodeAuthInfo(token)
if err != nil {
w.WriteHeader(http.StatusForbidden)
return
}
info.others = genEncodeStr(req)
err = verify(info, self.Secret)
if err != nil && err == errMismatchToken {
w.WriteHeader(http.StatusForbidden)
return
}
f(w, req)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auth
import (
"net/http"
"time"
)
type AuthTransport struct {
Secret []byte
Tr http.RoundTripper
}
func NewAuthTransport(tr http.RoundTripper, cfg *Config) http.RoundTripper {
if cfg.EnableAuth {
if cfg.Secret == "" {
panic("auth secret can not be nil")
}
return &AuthTransport{
Secret: []byte(cfg.Secret),
Tr: tr,
}
}
return nil
}
// RoundTrip attaches a simple auth token to the request
func (self *AuthTransport) RoundTrip(req *http.Request) (resp *http.Response, err error) {
now := time.Now().Unix()
info := &authInfo{timestamp: now, others: genEncodeStr(req)}
err = calculate(info, self.Secret)
if err != nil {
return self.Tr.RoundTrip(req)
}
token, err := encodeAuthInfo(info)
if err != nil {
return self.Tr.RoundTrip(req)
}
req.Header.Set(TokenHeaderKey, token)
return self.Tr.RoundTrip(req)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"net/http"
"github.com/cubefs/cubefs/blobstore/common/crc32block"
)
type crcDecoder struct{}
var _ ProgressHandler = (*crcDecoder)(nil)
func (*crcDecoder) Handler(w http.ResponseWriter, req *http.Request, f func(http.ResponseWriter, *http.Request)) {
if req.Header.Get(HeaderCrcEncoded) != "" && w.Header().Get(HeaderAckCrcEncoded) == "" {
if size := req.ContentLength; size > 0 && req.Body != nil {
decoder := crc32block.NewBodyDecoder(req.Body)
req.ContentLength = decoder.CodeSize(size)
req.Body = decoder
}
w.Header().Set(HeaderAckCrcEncoded, "1")
}
f(w, req)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"net"
"net/http"
"time"
"github.com/cubefs/cubefs/blobstore/common/rpc/auth"
)
// TransportConfig http transport config
type TransportConfig struct {
// DialTimeoutMs dial timeout in milliseconds
DialTimeoutMs int64 `json:"dial_timeout_ms"`
// ResponseHeaderTimeoutMs response header timeout after send the request
ResponseHeaderTimeoutMs int64 `json:"response_header_timeout_ms"`
MaxConnsPerHost int `json:"max_conns_per_host"`
MaxIdleConns int `json:"max_idle_conns"`
MaxIdleConnsPerHost int `json:"max_idle_conns_per_host"`
// IdleConnTimeout is the maximum amount of time an idle
// (keep-alive) connection will remain idle before closing
// itself. Zero means no limit.
IdleConnTimeoutMs int64 `json:"idle_conn_timeout_ms"`
// DisableCompression, if true, prevents the Transport from
// requesting compression with an "Accept-Encoding: gzip"
DisableCompression bool `json:"disable_compression"`
// auth config
Auth auth.Config `json:"auth"`
}
// Default returns the default transport config if nothing besides Auth is set.
// The Auth config is excluded from the check and carried over.
func (tc TransportConfig) Default() TransportConfig {
noAuth := tc
noAuth.Auth = auth.Config{}
none := TransportConfig{}
if noAuth == none {
return TransportConfig{
MaxConnsPerHost: 10,
MaxIdleConns: 1000,
MaxIdleConnsPerHost: 10,
IdleConnTimeoutMs: 10 * 1000,
Auth: tc.Auth,
}
}
return tc
}
// NewTransport returns http transport
func NewTransport(cfg *TransportConfig) http.RoundTripper {
tr := &http.Transport{
Proxy: http.ProxyFromEnvironment,
MaxConnsPerHost: cfg.MaxConnsPerHost,
MaxIdleConns: cfg.MaxIdleConns,
MaxIdleConnsPerHost: cfg.MaxIdleConnsPerHost,
IdleConnTimeout: time.Duration(cfg.IdleConnTimeoutMs) * time.Millisecond,
ResponseHeaderTimeout: time.Duration(cfg.ResponseHeaderTimeoutMs) * time.Millisecond,
DisableCompression: cfg.DisableCompression,
WriteBufferSize: 1 << 16,
ReadBufferSize: 1 << 16,
}
tr.DialContext = (&net.Dialer{
Timeout: time.Duration(cfg.DialTimeoutMs) * time.Millisecond,
KeepAlive: 30 * time.Second,
}).DialContext
if cfg.Auth.EnableAuth {
authTr := auth.NewAuthTransport(tr, &cfg.Auth)
if authTr != nil {
return authTr
}
}
return tr
}
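// Example (illustrative sketch of building an http.Client from the transport
// config; the timeout values are arbitrary):
//
//	tc := TransportConfig{DialTimeoutMs: 3000, ResponseHeaderTimeoutMs: 3000, IdleConnTimeoutMs: 30000}
//	cli := &http.Client{Transport: NewTransport(&tc)}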
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"bufio"
"fmt"
"io"
"math"
"net"
"net/http"
"strconv"
"strings"
"sync"
"github.com/julienschmidt/httprouter"
)
const (
abortIndex int8 = math.MaxInt8 >> 1
)
var jsonNull = [4]byte{'n', 'u', 'l', 'l'}
// Context handler context with http variables
type Context struct {
opts *serverOptions
Param httprouter.Params
Request *http.Request
Writer http.ResponseWriter
// pass key/value in whole request
mu sync.RWMutex
Meta map[string]interface{}
wroteHeader bool
// interceptors control
index int8
handlers []HandlerFunc
}
// ArgsBody args in body
func (c *Context) ArgsBody(args interface{}) error {
return c.ParseArgs(args, OptArgsBody())
}
// ArgsURI args in uri
func (c *Context) ArgsURI(args interface{}) error {
return c.ParseArgs(args, OptArgsURI())
}
// ArgsQuery args in query
func (c *Context) ArgsQuery(args interface{}) error {
return c.ParseArgs(args, OptArgsQuery())
}
// ArgsForm args in form
func (c *Context) ArgsForm(args interface{}) error {
return c.ParseArgs(args, OptArgsForm())
}
// ArgsPostForm args in post form
func (c *Context) ArgsPostForm(args interface{}) error {
return c.ParseArgs(args, OptArgsPostForm())
}
// ParseArgs reflect param to args
func (c *Context) ParseArgs(args interface{}, opts ...ServerOption) error {
if err := parseArgs(c, args, opts...); err != nil {
return NewError(http.StatusBadRequest, "Argument", err)
}
return nil
}
// RequestLength reads the request body length
func (c *Context) RequestLength() (int, error) {
cl := c.Request.ContentLength
if cl < 0 {
return 0, fmt.Errorf("Unknown content length in request")
}
return int(cl), nil
}
// Next should be used only inside interceptor.
// It executes the pending handlers inside the calling handler.
func (c *Context) Next() {
c.index++
for c.index < int8(len(c.handlers)) {
c.handlers[c.index](c)
c.index++
}
}
// IsAborted return aborted or not
func (c *Context) IsAborted() bool {
return c.index >= abortIndex
}
// Abort the next handlers
func (c *Context) Abort() {
c.index = abortIndex
}
// AbortWithStatus abort with status
func (c *Context) AbortWithStatus(statusCode int) {
c.RespondStatus(statusCode)
c.Abort()
}
// AbortWithStatusJSON abort with status and response data
func (c *Context) AbortWithStatusJSON(statusCode int, obj interface{}) {
c.RespondStatusData(statusCode, obj)
c.Abort()
}
// AbortWithError abort with error
func (c *Context) AbortWithError(err error) {
c.RespondError(err)
c.Abort()
}
// Respond response 200, and Content-Length: 0
func (c *Context) Respond() {
c.Writer.Header().Set(HeaderContentLength, "0")
c.RespondStatus(http.StatusOK)
}
// RespondStatus response status code
func (c *Context) RespondStatus(statusCode int) {
c.Writer.WriteHeader(statusCode)
c.wroteHeader = true
}
// RespondError response error
func (c *Context) RespondError(err error) {
httpErr := Error2HTTPError(err)
if httpErr == nil {
c.Respond()
return
}
c.RespondStatusData(httpErr.StatusCode(), errorResponse{
Error: httpErr.Error(),
Code: httpErr.ErrorCode(),
})
}
// RespondJSON response json
func (c *Context) RespondJSON(obj interface{}) {
c.RespondStatusData(http.StatusOK, obj)
}
// RespondStatusData response data with code
func (c *Context) RespondStatusData(statusCode int, obj interface{}) {
body, err := marshalObj(obj)
if err != nil {
c.RespondError(err)
return
}
c.RespondWithReader(statusCode, body.ContentLength, body.ContentType, body.Body, nil)
}
// RespondWith response with code, content-type, bytes
func (c *Context) RespondWith(statusCode int, contentType string, body []byte) {
c.Writer.Header().Set(HeaderContentType, contentType)
c.Writer.Header().Set(HeaderContentLength, strconv.Itoa(len(body)))
c.Writer.WriteHeader(statusCode)
c.wroteHeader = true
c.Writer.Write(body)
}
// RespondWithReader response with code, content-length, content-type, an io.Reader and extra headers
func (c *Context) RespondWithReader(statusCode int, contentLength int, contentType string,
body io.Reader, extraHeaders map[string]string) {
c.Writer.Header().Set(HeaderContentType, contentType)
c.Writer.Header().Set(HeaderContentLength, strconv.Itoa(contentLength))
for key, val := range extraHeaders {
c.Writer.Header().Set(key, val)
}
c.Writer.WriteHeader(statusCode)
c.wroteHeader = true
io.CopyN(c.Writer, body, int64(contentLength))
}
// Stream sends a streaming response and returns a boolean
// indicating whether the client disconnected in the middle of the stream.
func (c *Context) Stream(step func(w io.Writer) bool) bool {
w := c.Writer
clientGone := c.Request.Context().Done()
for {
select {
case <-clientGone:
return true
default:
keepOpen := step(w)
c.Flush()
if !keepOpen {
return false
}
}
}
}
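// exampleStream is a minimal sketch (not part of the original code) of a streaming
// handler built on Context.Stream: it writes the given chunks one by one and stops
// early if the client disconnects; the chunks parameter is a hypothetical input.
func exampleStream(c *Context, chunks [][]byte) {
	i := 0
	c.Stream(func(w io.Writer) bool {
		if i >= len(chunks) {
			return false // nothing left, close the stream
		}
		w.Write(chunks[i])
		i++
		return i < len(chunks) // keep the stream open while chunks remain
	})
}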
// Set is used to store a new key/value pair exclusively for this context.
func (c *Context) Set(key string, val interface{}) {
c.mu.Lock()
if c.Meta == nil {
c.Meta = make(map[string]interface{})
}
c.Meta[key] = val
c.mu.Unlock()
}
// Get returns the value for the given key.
// If the value does not exist it returns (nil, false).
func (c *Context) Get(key string) (val interface{}, exists bool) {
c.mu.RLock()
val, exists = c.Meta[key]
c.mu.RUnlock()
return
}
// RemoteIP parses the IP from Request.RemoteAddr, returns the net.IP (without the port).
func (c *Context) RemoteIP() (net.IP, bool) {
ip, _, err := net.SplitHostPort(strings.TrimSpace(c.Request.RemoteAddr))
if err != nil {
return nil, false
}
remoteIP := net.ParseIP(ip)
if remoteIP == nil {
return nil, false
}
return remoteIP, true
}
// Hijack implements the http.Hijacker interface.
func (c *Context) Hijack() (net.Conn, *bufio.ReadWriter, error) {
c.wroteHeader = true
return c.Writer.(http.Hijacker).Hijack()
}
// Flush implements the http.Flusher interface.
func (c *Context) Flush() {
c.Writer.(http.Flusher).Flush()
}
// Pusher returns the http.Pusher if the underlying writer supports it, otherwise nil.
func (c *Context) Pusher() http.Pusher {
if pusher, ok := c.Writer.(http.Pusher); ok {
return pusher
}
return nil
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"context"
"encoding/json"
"errors"
"net/http"
"strconv"
"syscall"
)
type (
// Error implements HTTPError
Error struct {
Status int // http status code
Code string // error code
Err error // error
}
// errorResponse response error with json
// internal type between server and client
errorResponse struct {
Error string `json:"error"`
Code string `json:"code,omitempty"`
}
statusCoder interface {
StatusCode() int
}
errorCoder interface {
ErrorCode() string
}
)
var _ HTTPError = &Error{}
// NewError returns a new Error with http status code, error code and raw error
func NewError(statusCode int, errCode string, err error) *Error {
return &Error{
Status: statusCode,
Code: errCode,
Err: err,
}
}
// StatusCode returns http status code
func (e *Error) StatusCode() int {
return e.Status
}
// ErrorCode returns special defined code
func (e *Error) ErrorCode() string {
return e.Code
}
// Error implements error
func (e *Error) Error() string {
if e.Err == nil {
return ""
}
return e.Err.Error()
}
// Unwrap returns the wrapped error for errors.Is(), errors.As() and errors.Unwrap()
func (e *Error) Unwrap() error {
return e.Err
}
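// exampleNotFound is a minimal sketch (not part of the original code) showing how a
// handler-side error can carry an HTTP status and a custom error code; the code string
// "NotFound" and the message are assumptions for illustration.
func exampleNotFound() error {
	return NewError(http.StatusNotFound, "NotFound", errors.New("blob not found"))
}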
// DetectStatusCode returns http status code
func DetectStatusCode(err error) int {
if err == nil {
return http.StatusOK
}
var st statusCoder
if errors.As(err, &st) {
return st.StatusCode()
}
switch err {
case syscall.EINVAL:
return http.StatusBadRequest
case context.Canceled:
return 499 // non-standard status: client closed request
default:
return http.StatusInternalServerError
}
}
// DetectErrorCode returns error code
func DetectErrorCode(err error) string {
if err == nil {
return ""
}
var ec errorCoder
if errors.As(err, &ec) {
return ec.ErrorCode()
}
switch err {
case syscall.EINVAL:
return "BadRequest"
case context.Canceled:
return "Canceled"
default:
return "InternalServerError"
}
}
// DetectError returns status code, error code, error
func DetectError(err error) (int, string, error) {
return DetectStatusCode(err), DetectErrorCode(err), errors.Unwrap(err)
}
// Error2HTTPError returns an interface HTTPError from an error
func Error2HTTPError(err error) HTTPError {
if err == nil {
return nil
}
if httpErr, ok := err.(HTTPError); ok {
return httpErr
}
status, code, _ := DetectError(err)
return NewError(status, code, err)
}
// ReplyErr directly replies an error with the response writer
func ReplyErr(w http.ResponseWriter, code int, err string) {
msg, _ := json.Marshal(NewError(code, "", errors.New(err)))
h := w.Header()
h.Set("Content-Length", strconv.Itoa(len(msg)))
h.Set("Content-Type", MIMEJSON)
w.WriteHeader(code)
w.Write(msg)
}
// ReplyWith directly replies a body with the response writer
func ReplyWith(w http.ResponseWriter, code int, bodyType string, msg []byte) {
h := w.Header()
h.Set("Content-Length", strconv.Itoa(len(msg)))
h.Set("Content-Type", bodyType)
w.WriteHeader(code)
w.Write(msg)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"context"
"errors"
"fmt"
"net/http"
urllib "net/url"
"strings"
"github.com/cubefs/cubefs/blobstore/common/trace"
)
var errNoHost = errors.New("no host available")
// LbConfig load balance config
type LbConfig struct {
// hosts
Hosts []string `json:"hosts"`
// backup hosts
BackupHosts []string `json:"backup_hosts"`
// HostTryTimes is the number of retries on failed hosts; keep HostTryTimes < RequestTryTimes
// to avoid requesting an unavailable host all the time.
HostTryTimes int `json:"host_try_times"`
// FailRetryIntervalS is the retry interval after a host failure; the default is -1,
// and if FailRetryIntervalS < 0, failed hosts are never removed.
FailRetryIntervalS int `json:"fail_retry_interval_s"`
// Within MaxFailsPeriodS, if the number of failures is greater than or equal
// to MaxFails, the host is considered disconnected.
MaxFailsPeriodS int `json:"max_fails_period_s"`
// RequestTryTimes is the maximum number of attempts for a single request.
RequestTryTimes int `json:"try_times"`
// should retry function
ShouldRetry func(code int, err error) bool `json:"-"`
// config for simple client
Config
}
type lbClient struct {
requestTryTimes int
// clientMap maps each host to its simple client
clientMap map[string]Client
sel Selector
cfg *LbConfig
}
var _ Client = (*lbClient)(nil)
// NewLbClient returns a lb client
func NewLbClient(cfg *LbConfig, sel Selector) Client {
if cfg == nil {
cfg = &LbConfig{}
}
cfg.Config.Tc = cfg.Config.Tc.Default()
if cfg.HostTryTimes == 0 {
cfg.HostTryTimes = (len(cfg.Hosts) + len(cfg.BackupHosts)) * 2
}
if cfg.MaxFailsPeriodS == 0 {
cfg.MaxFailsPeriodS = 1
}
if cfg.RequestTryTimes == 0 {
cfg.RequestTryTimes = cfg.HostTryTimes + 1
}
if cfg.ShouldRetry == nil {
cfg.ShouldRetry = defaultShouldRetry
}
if cfg.HostTryTimes > cfg.RequestTryTimes {
cfg.HostTryTimes = cfg.RequestTryTimes - 1
}
if cfg.FailRetryIntervalS == 0 {
cfg.FailRetryIntervalS = -1
}
if sel == nil {
sel = newSelector(cfg)
}
cl := &lbClient{sel: sel, cfg: cfg}
cl.clientMap = make(map[string]Client)
for _, host := range cfg.Hosts {
cl.clientMap[host] = NewClient(&cfg.Config)
}
for _, host := range cfg.BackupHosts {
cl.clientMap[host] = NewClient(&cfg.Config)
}
cl.requestTryTimes = cfg.RequestTryTimes
return cl
}
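// exampleNewLbClient is a minimal sketch (not part of the original code) of building a
// load-balanced client; the host addresses and the "/status" endpoint are assumptions,
// and the nil selector falls back to the built-in selector.
func exampleNewLbClient() {
	cli := NewLbClient(&LbConfig{
		Hosts:       []string{"http://10.0.0.1:9500", "http://10.0.0.2:9500"},
		BackupHosts: []string{"http://10.0.0.3:9500"},
	}, nil)
	defer cli.Close()

	var ret struct {
		Status string `json:"status"`
	}
	// GetWith retries across hosts according to RequestTryTimes and ShouldRetry.
	_ = cli.GetWith(context.Background(), "/status", &ret)
}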
var defaultShouldRetry = func(code int, err error) bool {
if err != nil || (code/100 != 4 && code/100 != 2) {
return true
}
return false
}
func (c *lbClient) Do(ctx context.Context, req *http.Request) (*http.Response, error) {
return c.doCtx(ctx, req)
}
func (c *lbClient) Form(ctx context.Context, method, url string, form map[string][]string) (resp *http.Response, err error) {
body := urllib.Values(form).Encode()
req, err := http.NewRequest(method, url, strings.NewReader(body))
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *lbClient) Put(ctx context.Context, url string, params interface{}) (resp *http.Response, err error) {
body, err := marshalObj(params)
if err != nil {
return
}
request, err := http.NewRequest(http.MethodPut, url, body.Body)
if err != nil {
return
}
request.Header.Set(HeaderContentType, body.ContentType)
return c.Do(ctx, request)
}
func (c *lbClient) Post(ctx context.Context, url string, params interface{}) (resp *http.Response, err error) {
body, err := marshalObj(params)
if err != nil {
return nil, err
}
request, err := http.NewRequest(http.MethodPost, url, body.Body)
if err != nil {
return nil, err
}
request.Header.Set(HeaderContentType, body.ContentType)
return c.Do(ctx, request)
}
func (c *lbClient) DoWith(ctx context.Context, req *http.Request, ret interface{}, opts ...Option) error {
for _, opt := range opts {
opt(req)
}
resp, err := c.Do(ctx, req)
if err != nil {
return err
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, req, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *lbClient) GetWith(ctx context.Context, url string, ret interface{}) error {
resp, err := c.Get(ctx, url)
if err != nil {
return err
}
return parseData(resp, ret)
}
func (c *lbClient) PutWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) (err error) {
body, err := marshalObj(params)
if err != nil {
return
}
request, err := http.NewRequest(http.MethodPut, url, body.Body)
if err != nil {
return
}
request.Header.Set(HeaderContentType, body.ContentType)
for _, opt := range opts {
opt(request)
}
resp, err := c.Do(ctx, request)
if err != nil {
return
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, request, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *lbClient) PostWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) error {
body, err := marshalObj(params)
if err != nil {
return err
}
request, err := http.NewRequest(http.MethodPost, url, body.Body)
if err != nil {
return err
}
request.Header.Set(HeaderContentType, body.ContentType)
for _, opt := range opts {
opt(request)
}
resp, err := c.Do(ctx, request)
if err != nil {
return err
}
defer resp.Body.Close()
// check that the server acknowledged the crc-encoded body
err = serverCrcEncodeCheck(ctx, request, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *lbClient) Head(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodHead, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *lbClient) Get(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *lbClient) Delete(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodDelete, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *lbClient) doCtx(ctx context.Context, r *http.Request) (resp *http.Response, err error) {
reqURI := r.URL.RequestURI()
span := trace.SpanFromContextSafe(ctx)
span.Debug("lb.doCtx: start", reqURI)
var (
hosts []string
tryTimes = c.requestTryTimes
index = 0
)
for i := 0; i < tryTimes; i++ {
// close failed body
if resp != nil && resp.Body != nil {
resp.Body.Close()
resp = nil
}
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
// get the available hosts
if index == len(hosts) || hosts == nil {
hosts = c.sel.GetAvailableHosts()
if len(hosts) < 1 {
err = errNoHost
span.Errorf("lb.doCtx: get host failed: %s", err.Error())
return
}
index = 0
}
host := hosts[index]
// get the real url
r.URL, err = urllib.Parse(host + reqURI)
if err != nil {
span.Errorf("lb.doCtx: parse %s error", host+reqURI)
return
}
r.Host = r.URL.Host
resp, err = c.clientMap[host].Do(ctx, r)
if i == tryTimes-1 {
span.Warnf("lb.doCtx: the last host of request, try times: %d, err: %v, host: %s",
i+1, err, host)
return
}
code := 0
if resp != nil {
code = resp.StatusCode
}
logInfo := fmt.Sprintf("try times: %d, code: %d, err: %v, host: %s", i+1, code, err, host)
if c.cfg.ShouldRetry(code, err) {
span.Info("lb.doCtx: retry host,", logInfo)
index++
c.sel.SetFail(host)
if r.Body == nil {
continue
}
if r.GetBody != nil {
var _err error
r.Body, _err = r.GetBody()
if _err != nil {
span.Warnf("lb.doCtx: retry failed, try times: %d, code: %d, err: %v, host: %s",
i+1, code, _err, host)
return
}
continue
}
span.Warn("lb.doCtx: request not support retry,", logInfo)
return
}
span.Debug("lb.doCtx: the last host of request,", logInfo)
return
}
return
}
func (c *lbClient) Close() {
c.sel.Close()
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import "net/http"
// MiddlewareHandler middleware above rpc server default router.
// Progress handlers run in the given order.
func MiddlewareHandler(phs ...ProgressHandler) http.Handler {
DefaultRouter.hasMiddleware = true
phs = append(DefaultRouter.headMiddlewares, phs...)
return buildHTTPHandler(DefaultRouter.ServeHTTP, phs...)
}
// MiddlewareHandlerFunc middleware func above rpc server default router.
// Progress handlers run in the given order.
func MiddlewareHandlerFunc(phs ...ProgressHandler) http.HandlerFunc {
DefaultRouter.hasMiddleware = true
phs = append(DefaultRouter.headMiddlewares, phs...)
return buildHTTPHandler(DefaultRouter.ServeHTTP, phs...)
}
// MiddlewareHandlerWith middleware above rpc server router
// Progress handlers run in the given order.
func MiddlewareHandlerWith(r *Router, phs ...ProgressHandler) http.Handler {
r.hasMiddleware = true
phs = append(r.headMiddlewares, phs...)
return buildHTTPHandler(r.ServeHTTP, phs...)
}
// MiddlewareHandlerFuncWith middleware func above rpc server router
// Progress handlers run in the given order.
func MiddlewareHandlerFuncWith(r *Router, phs ...ProgressHandler) http.HandlerFunc {
r.hasMiddleware = true
phs = append(r.headMiddlewares, phs...)
return buildHTTPHandler(r.ServeHTTP, phs...)
}
func buildHTTPHandler(h http.HandlerFunc, phs ...ProgressHandler) http.HandlerFunc {
if len(phs) == 0 {
return h
}
last := len(phs) - 1
return buildHTTPHandler(func(w http.ResponseWriter, req *http.Request) {
phs[last].Handler(w, req, h)
}, phs[:last]...)
}
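// exampleHeaderMiddleware is a minimal sketch (not part of the original code) of a
// ProgressHandler: it sets a response header and then hands the request to the next
// handler; the header name is an assumption.
type exampleHeaderMiddleware struct{}

func (exampleHeaderMiddleware) Handler(w http.ResponseWriter, req *http.Request,
	next func(http.ResponseWriter, *http.Request)) {
	w.Header().Set("X-Example-Middleware", "1")
	next(w, req)
}

// exampleUseMiddleware wires the middleware above a standalone router.
func exampleUseMiddleware() http.Handler {
	return MiddlewareHandlerWith(New(), exampleHeaderMiddleware{})
}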
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"bytes"
"crypto/md5"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
"os"
"path"
"runtime"
"strings"
"github.com/cubefs/cubefs/blobstore/util/version"
)
// headers
const (
HeaderContentType = "Content-Type"
HeaderContentLength = "Content-Length"
HeaderContentRange = "Content-Range"
HeaderContentMD5 = "Content-MD5"
HeaderUA = "User-Agent"
// trace
HeaderTraceLog = "Trace-Log"
HeaderTraceTags = "Trace-Tags"
// crc checker
HeaderCrcEncoded = "X-Crc-Encoded"
HeaderAckCrcEncoded = "X-Ack-Crc-Encoded"
)
// mime
const (
MIMEStream = "application/octet-stream"
MIMEJSON = "application/json"
MIMEXML = "application/xml"
MIMEPlain = "text/plain"
MIMEPOSTForm = "application/x-www-form-urlencoded"
MIMEMultipartPOSTForm = "multipart/form-data"
MIMEYAML = "application/x-yaml"
)
// encoding
const (
GzipEncodingType = "gzip"
)
// UserAgent user agent
var UserAgent = "Golang blobstore/rpc package"
type (
// ValueGetter fills an argument's field from url values or http params.
ValueGetter func(string) string
// Parser is the interface implemented by argument types
// that can parse themselves from url.Values.
Parser interface {
Parse(ValueGetter) error
}
// priority of marshaler and unmarshaler (default is json).
// - - - - - - - - - - - - - - - - - - - - - - - -
// |        |  marshaler   |  unmarshaler    |
// | higher |              |                 |
// |   ^    | MarshalerTo  | UnmarshalerFrom |
// |   |    | Marshaler    | Unmarshaler     |
// |   |    | JSON Marshal | JSON Unmarshal  |
// | lower  |              |                 |
// - - - - - - - - - - - - - - - - - - - - - - - -
//
// Actions on RPC.
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// |      APP      |   Client    |    TCP    |   Server    |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | Request Type  | marshaler   | - - - - > | unmarshaler |
// |               |             |           |      |      |
// |               |             |           |      v      |
// | Response Type | unmarshaler | < - - - - | marshaler   |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Marshaler is the interface implemented by types that
// can marshal themselves into bytes, second parameter
// is content type.
Marshaler interface {
Marshal() ([]byte, string, error)
}
// MarshalerTo is the interface implemented by types that
// can marshal themselves into writer, the first parameter
// is content type. (Not Recommended).
// The underlying writer is a *bytes.Buffer.
// Context.RespondWithReader is better than MarshalerTo on Server Side.
MarshalerTo interface {
MarshalTo(responseBody io.Writer) (string, error)
}
// Unmarshaler is the interface implemented by types
// that can unmarshal themselves from bytes.
Unmarshaler interface {
Unmarshal([]byte) error
}
// UnmarshalerFrom is the interface implemented by types
// that can unmarshal themselves from body reader.
// The body underlying implementation is a *io.LimitedReader.
UnmarshalerFrom interface {
UnmarshalFrom(requestBody io.Reader) error
}
// HTTPError interface of error with http status code
HTTPError interface {
// StatusCode http status code
StatusCode() int
// ErrorCode special defined code
ErrorCode() string
// Error detail message
Error() string
}
)
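// exampleRawBody is a minimal sketch (not part of the original code) of a request type
// implementing Marshaler, so marshalObj sends pre-encoded bytes with a custom content type.
type exampleRawBody struct{ data []byte }

// Marshal returns the raw payload and marks it as an octet stream.
func (b exampleRawBody) Marshal() ([]byte, string, error) {
	return b.data, MIMEStream, nil
}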
// ProgressHandler http progress handler
type ProgressHandler interface {
Handler(http.ResponseWriter, *http.Request, func(http.ResponseWriter, *http.Request))
}
// NoneBody means no body for request or response.
var NoneBody Marshaler = noneBody{}
type noneBody struct{}
func (noneBody) Marshal() ([]byte, string, error) {
return []byte{}, "", nil
}
type marshalledBody struct {
ContentLength int
ContentType string
Body io.Reader
}
func marshalObj(obj interface{}) (*marshalledBody, error) {
var (
buffer []byte
ct string = MIMEJSON
err error
)
if obj == nil {
buffer = jsonNull[:]
} else if o, ok := obj.(MarshalerTo); ok {
w := bytes.NewBuffer(nil)
ct, err = o.MarshalTo(w)
if err != nil {
return nil, err
}
return &marshalledBody{
ContentLength: w.Len(),
ContentType: ct,
Body: w,
}, nil
} else if o, ok := obj.(Marshaler); ok {
buffer, ct, err = o.Marshal()
} else {
buffer, err = json.Marshal(obj)
}
if err != nil {
return nil, err
}
return &marshalledBody{
ContentLength: len(buffer),
ContentType: ct,
Body: bytes.NewReader(buffer),
}, nil
}
func programVersion() string {
sp := strings.Fields(strings.TrimSpace(version.Version()))
if len(sp) == 0 || sp[0] == "develop" {
data, err := ioutil.ReadFile(os.Args[0])
if err != nil {
return "_"
}
return fmt.Sprintf("%x", md5.Sum(data))[:10]
}
if len(sp) > 10 {
return sp[0][:10]
}
return sp[0]
}
func init() {
hostname, _ := os.Hostname()
ua := fmt.Sprintf("%s/%s (%s/%s; %s) %s/%s",
path.Base(os.Args[0]),
programVersion(),
runtime.GOOS,
runtime.GOARCH,
runtime.Version(),
hostname,
fmt.Sprint(os.Getpid()),
)
if UserAgent != ua {
UserAgent = ua
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"bytes"
"fmt"
"net"
"net/http"
"os"
"runtime"
"strings"
"github.com/cubefs/cubefs/blobstore/util/log"
)
// defaultRecovery logs the panic info, then re-panics to the next panic handler unless the connection is a broken pipe
func defaultRecovery(w http.ResponseWriter, req *http.Request, err interface{}) {
var brokenPipe bool
if ne, ok := err.(*net.OpError); ok {
if se, ok := ne.Err.(*os.SyscallError); ok {
if strings.Contains(strings.ToLower(se.Error()), "broken pipe") ||
strings.Contains(strings.ToLower(se.Error()), "connection reset by peer") {
brokenPipe = true
}
}
}
stack := stack(3)
if brokenPipe {
log.Warnf("handle panic: %s on broken pipe\n%s", err, stack)
} else {
log.Errorf("handle panic: %s\n%s", err, stack)
panic(err)
}
}
func stack(skip int) []byte {
buf := new(bytes.Buffer)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)
if !ok {
break
}
fmt.Fprintf(buf, "%s:%d (0x%x:%s)\n", file, line, pc, funcname(pc))
}
return buf.Bytes()
}
// funcname returns the name of the function at the given program counter
func funcname(pc uintptr) []byte {
fn := runtime.FuncForPC(pc)
if fn == nil {
return []byte("???")
}
name := []byte(fn.Name())
if last := bytes.LastIndex(name, []byte("/")); last >= 0 {
name = name[last+1:]
}
if first := bytes.Index(name, []byte(".")); first >= 0 {
name = name[first+1:]
}
return name
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"net/http"
"reflect"
"runtime"
"github.com/julienschmidt/httprouter"
"github.com/cubefs/cubefs/blobstore/util/log"
)
type (
// Router is a router with interceptors.
// Middleware within this Router is called an `interceptor`
// (it is middleware for the http server, just named differently).
//
// headMiddlewares are the middlewares that run first of all.
// Running order is:
// headMiddlewares --> middlewares --> interceptors --> handler
//
// example:
// router := New()
// router.Use(interceptor1, interceptor2)
// router.Handle(http.MethodGet, "/get/:name", handlerGet)
// router.Handle(http.MethodPut, "/put/:name", handlerPut)
Router struct {
Router *httprouter.Router // router
hasMiddleware bool // true if the router is wrapped by Middleware*
headMiddlewares []ProgressHandler // middlewares that run first of all
headHandler http.HandlerFunc // handler to run when there is no middleware
interceptors []HandlerFunc // interceptors after middlewares
}
)
// ServeHTTP makes the router implement the http.Handler interface.
func (r *Router) ServeHTTP(w http.ResponseWriter, req *http.Request) {
if !r.hasMiddleware {
r.headHandler(w, req)
return
}
r.Router.ServeHTTP(w, req)
}
// DefaultRouter default router for server
var DefaultRouter *Router
func init() {
initDefaultRouter()
}
func initDefaultRouter() {
DefaultRouter = New()
DefaultRouter.Router.PanicHandler = defaultRecovery
}
// New is an alias of httprouter.New.
// It returns a Router that you control yourself.
func New() *Router {
r := &Router{
Router: httprouter.New(),
hasMiddleware: false,
headMiddlewares: []ProgressHandler{&crcDecoder{}},
}
r.headHandler = buildHTTPHandler(r.Router.ServeHTTP, r.headMiddlewares...)
return r
}
// Use attaches a global interceptor to the router.
// You should Use interceptors before registering handlers.
// Interceptors run in the order they were registered.
func (r *Router) Use(interceptors ...HandlerFunc) {
if len(r.interceptors)+len(interceptors) >= int(abortIndex) {
panic("too many registered handlers")
}
r.interceptors = append(r.interceptors, interceptors...)
}
// Handle registers a new request handle with the given path and method.
//
// For HEAD, GET, POST, PUT, PATCH and DELETE requests the respective shortcut
// functions can be used.
func (r *Router) Handle(method, path string, handler HandlerFunc, opts ...ServerOption) {
// Notice: in Go, after `sliceA := append(sliceB, item)`,
// sliceA may share the underlying array with sliceB if sliceB has enough capacity,
// so we make a new slice here.
handlers := make([]HandlerFunc, 0, len(r.interceptors)+1)
handlers = append(handlers, r.interceptors...)
handlers = append(handlers, handler)
if len(handlers) >= int(abortIndex) {
panic("too many registered handlers")
}
r.Router.Handle(method, path, makeHandler(handlers, opts...))
opt := new(serverOptions)
for _, o := range opts {
o.apply(opt)
}
icnames := make([]string, 0, len(r.interceptors))
for _, ic := range r.interceptors {
icnames = append(icnames, runtime.FuncForPC(reflect.ValueOf(ic).Pointer()).Name())
}
name := runtime.FuncForPC(reflect.ValueOf(handler).Pointer()).Name()
log.Infof("register handler method:%s, path:%s, interceptors:%s, handler:%s, opts:%+v",
method, path, icnames, name, opt)
}
// Use attaches a global interceptor to the default router.
// You should Use interceptors before registering handlers.
// Interceptors run in the order they were registered.
func Use(interceptors ...HandlerFunc) {
DefaultRouter.interceptors = append(DefaultRouter.interceptors, interceptors...)
}
// HEAD is a shortcut for Handle(http.MethodHead, path, handle)
func HEAD(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodHead, path, handler, opts...)
}
// GET is a shortcut for Handle(http.MethodGet, path, handle)
func GET(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodGet, path, handler, opts...)
}
// POST is a shortcut for Handle(http.MethodPost, path, handle)
func POST(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodPost, path, handler, opts...)
}
// PUT is a shortcut for Handle(http.MethodPut, path, handle)
func PUT(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodPut, path, handler, opts...)
}
// DELETE is a shortcut for Handle(http.MethodDelete, path, handle)
func DELETE(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodDelete, path, handler, opts...)
}
// OPTIONS is a shortcut for Handle(http.MethodOptions, path, handle)
func OPTIONS(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodOptions, path, handler, opts...)
}
// PATCH is a shortcut for Handle(http.MethodPatch, path, handle)
func PATCH(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodPatch, path, handler, opts...)
}
// Handle registers a new request handle with the given path and method.
//
// For HEAD, GET, POST, PUT, PATCH and DELETE requests the respective shortcut
// functions can be used.
func Handle(method, path string, handler HandlerFunc, opts ...ServerOption) {
DefaultRouter.Handle(method, path, handler, opts...)
}
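// exampleRegister is a minimal sketch (not part of the original code) of registering a
// handler on the default router; the "/ping/:name" path is an assumption.
func exampleRegister() {
	GET("/ping/:name", func(c *Context) {
		// Param is filled by httprouter from the ":name" segment.
		c.RespondJSON(map[string]string{"name": c.Param.ByName("name")})
	})
}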
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"context"
"math/rand"
"sync"
"time"
)
// hostItem Host information
type hostItem struct {
rawHost string
// The last time the host failed
lastFailedTime int64
// The number of times the host can retry
retryTimes int
// Whether the host is a backup host
isBackup bool
sync.RWMutex
}
type Selector interface {
// GetAvailableHosts get the available hosts
GetAvailableHosts() []string
// SetFail marks the host as failed
SetFail(string)
// Close stops the background goroutine (if any) that re-enables broken hosts
Close()
}
// allocate hostItem to request
type selector struct {
// normal hosts
hosts []*hostItem
// broken hosts
crackHosts map[*hostItem]interface{}
// backup hosts
backupHost []*hostItem
hostMap map[string]*hostItem
// the number of failures a host may accumulate before it is marked as broken (initialized from HostTryTimes)
hostTryTimes int
// retry interval after host failure, after this time, the host can be remarked as available
failRetryIntervalS int
// the time window for counting failures; if exceeded, retryTimes is reset and lastFailedTime is set to the current time
maxFailsPeriodS int
sync.RWMutex
cancelDetectionGoroutine context.CancelFunc
}
func newSelector(cfg *LbConfig) Selector {
ctx, cancelFunc := context.WithCancel(context.Background())
rand.Seed(time.Now().UnixNano())
s := &selector{
hosts: initHost(cfg.Hosts, cfg, false),
backupHost: initHost(cfg.BackupHosts, cfg, true),
hostTryTimes: cfg.HostTryTimes,
failRetryIntervalS: cfg.FailRetryIntervalS,
crackHosts: map[*hostItem]interface{}{},
hostMap: map[string]*hostItem{},
cancelDetectionGoroutine: cancelFunc,
maxFailsPeriodS: cfg.MaxFailsPeriodS,
}
s.initHostMap()
if cfg.FailRetryIntervalS < 0 {
return s
}
go func() {
s.detectAvailableHostInBack()
ticker := time.NewTicker(time.Duration(s.failRetryIntervalS) * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
s.detectAvailableHostInBack()
case <-ctx.Done():
return
}
}
}()
return s
}
// GetAvailableHosts return available hosts from hosts and backupHost
func (s *selector) GetAvailableHosts() (hosts []string) {
s.RLock()
hostLen := len(s.hosts)
length := len(s.hosts) + len(s.backupHost)
hosts = make([]string, length)
for index, host := range s.hosts {
hosts[index] = host.rawHost
}
for index, host := range s.backupHost {
hosts[index+hostLen] = host.rawHost
}
s.RUnlock()
randomShuffle(hosts, hostLen)
return
}
// SetFail decreases the retryTimes of the hostItem and disables the host when retries run out
func (s *selector) SetFail(host string) {
if s.failRetryIntervalS < 0 {
return
}
item := s.hostMap[host]
item.Lock()
now := time.Now().Unix()
// init last failed time
if item.lastFailedTime == 0 {
item.lastFailedTime = now
}
// update last failed time
if now-item.lastFailedTime >= int64(s.maxFailsPeriodS) {
item.retryTimes = s.hostTryTimes
item.lastFailedTime = now
}
item.retryTimes -= 1
if item.retryTimes > 0 {
item.Unlock()
return
}
item.Unlock()
s.disableHost(item)
}
// detectAvailableHostInBack re-enables recovered hosts from crackHosts
func (s *selector) detectAvailableHostInBack() {
var cache []*hostItem
s.RLock()
for key := range s.crackHosts {
cache = append(cache, key)
}
s.RUnlock()
for _, hItem := range cache {
hItem.Lock()
now := time.Now().Unix()
if now-hItem.lastFailedTime >= int64(s.failRetryIntervalS) {
hItem.retryTimes = s.hostTryTimes
hItem.lastFailedTime = 0
hItem.Unlock()
s.enableHost(hItem)
continue
}
hItem.Unlock()
}
}
func initHost(hosts []string, cfg *LbConfig, isBackup bool) (hs []*hostItem) {
for _, host := range hosts {
hs = append(hs, &hostItem{
retryTimes: cfg.HostTryTimes,
rawHost: host,
isBackup: isBackup,
})
}
return
}
func (s *selector) initHostMap() {
for _, item := range s.hosts {
s.hostMap[item.rawHost] = item
}
for _, item := range s.backupHost {
s.hostMap[item.rawHost] = item
}
}
// shuffle the hosts for load balancing; normal hosts and backup hosts are shuffled separately
func randomShuffle(hosts []string, length int) {
for i := length; i > 0; i-- {
lastIdx := i - 1
idx := rand.Intn(i)
hosts[lastIdx], hosts[idx] = hosts[idx], hosts[lastIdx]
}
for i := len(hosts); i > length; i-- {
lastIdx := i - 1
idx := rand.Intn(i-length) + length
hosts[lastIdx], hosts[idx] = hosts[idx], hosts[lastIdx]
}
}
// add unavailable host from hosts or backupHost into crackHosts
func (s *selector) disableHost(item *hostItem) {
s.Lock()
defer s.Unlock()
s.crackHosts[item] = struct{}{}
index := 0
var temp *[]*hostItem
if item.isBackup {
temp = &s.backupHost
} else {
temp = &s.hosts
}
for ; index < len(*temp); index++ {
if item == (*temp)[index] {
if index == len(*temp)-1 {
*temp = (*temp)[:index]
return
}
*temp = append((*temp)[:index], (*temp)[index+1:]...)
return
}
}
}
// enableHost add available host from crackHosts into backupHost or hosts
func (s *selector) enableHost(hItem *hostItem) {
s.Lock()
defer s.Unlock()
delete(s.crackHosts, hItem)
if hItem.isBackup {
s.backupHost = append(s.backupHost, hItem)
return
}
s.hosts = append(s.hosts, hItem)
}
func (s *selector) Close() {
s.cancelDetectionGoroutine()
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"net/http"
"github.com/julienschmidt/httprouter"
)
type (
// HandlerFunc defines the handler of app function
HandlerFunc func(*Context)
// ServerOption server option applier
// Order: if args are in the body, other options are ignored,
// else uri > query > form > postform
ServerOption interface {
apply(*serverOptions)
}
serverOptions struct {
argsBody bool
argsURI bool
argsQuery bool
argsForm bool
argsPostForm bool
metaCapacity int
}
funcServerOption struct {
f func(*serverOptions)
}
)
func (so *serverOptions) copy() *serverOptions {
return &serverOptions{
argsBody: so.argsBody,
argsURI: so.argsURI,
argsQuery: so.argsQuery,
argsForm: so.argsForm,
argsPostForm: so.argsPostForm,
metaCapacity: so.metaCapacity,
}
}
func (so *serverOptions) hasArgs() bool {
return so.argsBody || so.argsURI || so.argsQuery || so.argsForm || so.argsPostForm
}
func (fo *funcServerOption) apply(f *serverOptions) {
fo.f(f)
}
func newFuncServerOption(f func(*serverOptions)) *funcServerOption {
return &funcServerOption{
f: f,
}
}
// OptArgsBody argument in request body
func OptArgsBody() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsBody = true
})
}
// OptArgsURI argument in uri
func OptArgsURI() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsURI = true
})
}
// OptArgsQuery argument in query string
func OptArgsQuery() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsQuery = true
})
}
// OptArgsForm argument in form
func OptArgsForm() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsForm = true
})
}
// OptArgsPostForm argument in post form
func OptArgsPostForm() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsPostForm = true
})
}
// OptMetaCapacity initial meta capacity
func OptMetaCapacity(capacity int) ServerOption {
return newFuncServerOption(func(o *serverOptions) {
if capacity >= 0 {
o.metaCapacity = capacity
}
})
}
// makeHandler make handle of httprouter
func makeHandler(handlers []HandlerFunc, opts ...ServerOption) httprouter.Handle {
opt := new(serverOptions)
for _, o := range opts {
o.apply(opt)
}
return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
c := &Context{
opts: opt,
Param: ps,
Request: r,
Writer: w,
Meta: make(map[string]interface{}, opt.metaCapacity),
index: -1,
handlers: handlers,
}
c.Next()
if !c.wroteHeader {
c.RespondStatus(http.StatusOK)
}
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
urllib "net/url"
"strings"
"time"
"github.com/cubefs/cubefs/blobstore/common/crc32block"
"github.com/cubefs/cubefs/blobstore/common/trace"
"github.com/cubefs/cubefs/blobstore/util/bytespool"
"github.com/cubefs/cubefs/blobstore/util/errors"
)
// Config simple client config
type Config struct {
// the whole request and response timeout
ClientTimeoutMs int64 `json:"client_timeout_ms"`
// bandwidth in MB/s for reading the body
BodyBandwidthMBPs float64 `json:"body_bandwidth_mbps"`
// base timeout for read body
BodyBaseTimeoutMs int64 `json:"body_base_timeout_ms"`
// transport config
Tc TransportConfig `json:"transport_config"`
}
// ErrBodyReadTimeout timeout error
var ErrBodyReadTimeout = errors.New("read body timeout")
// Option client options
type Option func(req *http.Request)
// WithCrcEncode request with crc32 encode
func WithCrcEncode() Option {
return func(req *http.Request) {
req.Header.Set(HeaderCrcEncoded, "1")
// the crc32block encoder does not support a nil reader
if req.ContentLength > 0 && req.Body != nil {
encoder := crc32block.NewBodyEncoder(req.Body)
req.Body = encoder
if bodyGetter := req.GetBody; bodyGetter != nil {
req.GetBody = func() (io.ReadCloser, error) {
body, err := bodyGetter()
return crc32block.NewBodyEncoder(body), err
}
}
req.ContentLength = encoder.CodeSize(req.ContentLength)
}
}
}
// Client implements the rpc client with http
type Client interface {
// Method*** handle response by yourself
Do(ctx context.Context, req *http.Request) (*http.Response, error)
Head(ctx context.Context, url string) (*http.Response, error)
Get(ctx context.Context, url string) (*http.Response, error)
Delete(ctx context.Context, url string) (*http.Response, error)
Form(ctx context.Context, method, url string, form map[string][]string) (*http.Response, error)
Put(ctx context.Context, url string, params interface{}) (*http.Response, error)
Post(ctx context.Context, url string, params interface{}) (*http.Response, error)
// ***With means parse result in client
DoWith(ctx context.Context, req *http.Request, ret interface{}, opts ...Option) error
GetWith(ctx context.Context, url string, ret interface{}) error
PutWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) error
PostWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) error
// Close background goroutines in lb client
Close()
}
type client struct {
client *http.Client
bandwidthBPMs int64 // bytes per millisecond, used for reading the body
bodyBaseTimeoutMs int64 // base timeout in ms for reading the body
}
// NewClient returns a rpc client
func NewClient(cfg *Config) Client {
if cfg == nil {
cfg = &Config{}
}
cfg.Tc = cfg.Tc.Default()
if cfg.BodyBaseTimeoutMs == 0 {
cfg.BodyBaseTimeoutMs = 30 * 1e3
}
return &client{
client: &http.Client{
Transport: NewTransport(&cfg.Tc),
Timeout: time.Duration(cfg.ClientTimeoutMs) * time.Millisecond,
},
bandwidthBPMs: int64(cfg.BodyBandwidthMBPs * (1 << 20) / 1e3),
bodyBaseTimeoutMs: cfg.BodyBaseTimeoutMs,
}
}
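// exampleSimpleClient is a minimal sketch (not part of the original code) of a simple
// client call; the address, path, argument and result types are assumptions.
func exampleSimpleClient() error {
	cli := NewClient(&Config{
		ClientTimeoutMs:   10 * 1000,
		BodyBandwidthMBPs: 10,
	})
	defer cli.Close()

	args := struct {
		Name string `json:"name"`
	}{Name: "demo"}
	var ret struct {
		ID uint64 `json:"id"`
	}
	// PostWith marshals args as JSON, sends the request with a crc-encoded body,
	// and decodes the JSON response into ret.
	return cli.PostWith(context.Background(), "http://127.0.0.1:9500/create", &ret, args, WithCrcEncode())
}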
func (c *client) Form(ctx context.Context, method, url string, form map[string][]string) (resp *http.Response, err error) {
body := urllib.Values(form).Encode()
request, err := http.NewRequest(method, url, strings.NewReader(body))
if err != nil {
return
}
return c.Do(ctx, request)
}
func (c *client) Put(ctx context.Context, url string, params interface{}) (resp *http.Response, err error) {
body, err := marshalObj(params)
if err != nil {
return
}
request, err := http.NewRequest(http.MethodPut, url, body.Body)
if err != nil {
return
}
request.Header.Set(HeaderContentType, body.ContentType)
return c.Do(ctx, request)
}
func (c *client) Post(ctx context.Context, url string, params interface{}) (resp *http.Response, err error) {
body, err := marshalObj(params)
if err != nil {
return nil, err
}
request, err := http.NewRequest(http.MethodPost, url, body.Body)
if err != nil {
return nil, err
}
request.Header.Set(HeaderContentType, body.ContentType)
return c.Do(ctx, request)
}
func (c *client) DoWith(ctx context.Context, req *http.Request, ret interface{}, opts ...Option) error {
for _, opt := range opts {
opt(req)
}
resp, err := c.Do(ctx, req)
if err != nil {
return err
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, req, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *client) GetWith(ctx context.Context, url string, ret interface{}) error {
resp, err := c.Get(ctx, url)
if err != nil {
return err
}
return parseData(resp, ret)
}
func (c *client) PutWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) (err error) {
body, err := marshalObj(params)
if err != nil {
return
}
request, err := http.NewRequest(http.MethodPut, url, body.Body)
if err != nil {
return
}
request.Header.Set(HeaderContentType, body.ContentType)
for _, opt := range opts {
opt(request)
}
resp, err := c.Do(ctx, request)
if err != nil {
return
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, request, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *client) PostWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) error {
body, err := marshalObj(params)
if err != nil {
return err
}
request, err := http.NewRequest(http.MethodPost, url, body.Body)
if err != nil {
return err
}
request.Header.Set(HeaderContentType, body.ContentType)
for _, opt := range opts {
opt(request)
}
resp, err := c.Do(ctx, request)
if err != nil {
return err
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, request, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *client) Head(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodHead, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *client) Get(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *client) Delete(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodDelete, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *client) Do(ctx context.Context, req *http.Request) (*http.Response, error) {
if req.Header.Get(HeaderUA) == "" {
req.Header.Set(HeaderUA, UserAgent)
}
span := trace.SpanFromContextSafe(ctx)
err := trace.InjectWithHTTPHeader(ctx, req)
if err != nil {
span.Errorf("inject failed, %v", err)
}
resp, err := c.doWithCtx(ctx, req)
if err != nil {
return resp, err
}
header := resp.Header
traceLog := header[HeaderTraceLog]
if len(traceLog) > 0 {
span.AppendRPCTrackLog([]string{strings.Join(traceLog, ";")})
}
return resp, err
}
func (c *client) Close() {
// Do nothing to close.
}
func (c *client) doWithCtx(ctx context.Context, req *http.Request) (resp *http.Response, err error) {
span := trace.SpanFromContextSafe(ctx)
req = req.WithContext(ctx)
if c.bandwidthBPMs > 0 && req.Body != nil {
t := req.ContentLength/c.bandwidthBPMs + c.bodyBaseTimeoutMs
req.Body = &timeoutReadCloser{timeoutMs: t, body: req.Body}
}
resp, err = c.client.Do(req)
if err != nil {
span.Warnf("do request to %s failed, error: %s", req.URL, err.Error())
return
}
if c.bandwidthBPMs > 0 {
t := resp.ContentLength/c.bandwidthBPMs + c.bodyBaseTimeoutMs
resp.Body = &timeoutReadCloser{timeoutMs: t, body: resp.Body}
}
return
}
// parseData closes the response body inside this package.
func parseData(resp *http.Response, data interface{}) (err error) {
defer resp.Body.Close()
return ParseData(resp, data)
}
// ParseData parse response with data, close response body by yourself.
func ParseData(resp *http.Response, data interface{}) (err error) {
if resp.StatusCode/100 == 2 {
size := resp.ContentLength
if data != nil && size != 0 {
if d, ok := data.(UnmarshalerFrom); ok {
return d.UnmarshalFrom(io.LimitReader(resp.Body, size))
}
if d, ok := data.(Unmarshaler); ok {
buf := bytespool.Alloc(int(size))
defer bytespool.Free(buf)
if _, err = io.ReadFull(resp.Body, buf); err != nil {
return NewError(resp.StatusCode, "ReadResponse", err)
}
return d.Unmarshal(buf)
}
if err := json.NewDecoder(resp.Body).Decode(data); err != nil {
return NewError(resp.StatusCode, "JSONDecode", err)
}
}
if resp.StatusCode == 200 {
return nil
}
return NewError(resp.StatusCode, "", err)
}
return ParseResponseErr(resp)
}
// ParseResponseErr parse error of response
func ParseResponseErr(resp *http.Response) (err error) {
// wrap the error with HTTPError when the StatusCode is not 2XX
if resp.StatusCode > 299 && resp.ContentLength != 0 {
errR := &errorResponse{}
if err := json.NewDecoder(resp.Body).Decode(errR); err != nil {
return NewError(resp.StatusCode, resp.Status, nil)
}
err = NewError(resp.StatusCode, errR.Code, errors.New(errR.Error))
return
}
return NewError(resp.StatusCode, resp.Status, nil)
}
type timeoutReadCloser struct {
body io.ReadCloser
timeoutMs int64
}
func (tr *timeoutReadCloser) Close() (err error) {
return tr.body.Close()
}
func (tr *timeoutReadCloser) Read(p []byte) (n int, err error) {
readOk := make(chan struct{})
if tr.timeoutMs > 0 {
startTime := time.Now().UnixNano() / 1e6
after := time.After(time.Millisecond * time.Duration(tr.timeoutMs))
go func() {
n, err = tr.body.Read(p)
close(readOk)
}()
select {
case <-readOk:
// subtract the time actually spent on this read
tr.timeoutMs = tr.timeoutMs - (time.Now().UnixNano()/1e6 - startTime)
return
case <-after:
tr.body.Close()
return 0, ErrBodyReadTimeout
}
}
tr.body.Close()
return 0, ErrBodyReadTimeout
}
func serverCrcEncodeCheck(ctx context.Context, request *http.Request, resp *http.Response) (err error) {
// check that the server acknowledged the crc-encoded body and log an error if not
if request.Header.Get(HeaderCrcEncoded) != "" && resp.Header.Get(HeaderAckCrcEncoded) == "" {
msg := fmt.Sprintf("server do not ack that body has been crc encoded, url:%v", request.URL)
trace.SpanFromContextSafe(ctx).Error(msg)
return NewError(http.StatusNotImplemented, "resp.Status", errors.New(msg))
}
return nil
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"strconv"
"strings"
"github.com/opentracing/opentracing-go"
)
const (
prefixTracer = "blobstore-tracer-"
tracerFieldCount = 2
)
// Propagation uses these keys.
// Define your own keys by setting these variables before your application starts.
var (
RequestIDKey = "X-Reqid"
PrefixBaggage = "blobstore-baggage-"
FieldKeyTraceID = prefixTracer + "traceid"
FieldKeySpanID = prefixTracer + "spanid"
)
var (
// ErrUnsupportedFormat is the alias of opentracing.ErrUnsupportedFormat.
ErrUnsupportedFormat = opentracing.ErrUnsupportedFormat
// ErrSpanContextNotFound is the alias of opentracing.ErrSpanContextNotFound.
ErrSpanContextNotFound = opentracing.ErrSpanContextNotFound
// ErrInvalidSpanContext is the alias of opentracing.ErrInvalidSpanContext.
ErrInvalidSpanContext = opentracing.ErrInvalidSpanContext
// ErrInvalidCarrier is the alias of opentracing.ErrInvalidCarrier.
ErrInvalidCarrier = opentracing.ErrInvalidCarrier
// ErrSpanContextCorrupted is the alias of opentracing.ErrSpanContextCorrupted.
ErrSpanContextCorrupted = opentracing.ErrSpanContextCorrupted
)
const (
// Binary is the alias of opentracing.Binary.
Binary = opentracing.Binary
// TextMap is the alias of opentracing.TextMap.
TextMap = opentracing.TextMap
// HTTPHeaders is the alias of opentracing.HTTPHeaders.
HTTPHeaders = opentracing.HTTPHeaders
)
// TextMapCarrier is the alias of opentracing.TextMapCarrier.
type TextMapCarrier = opentracing.TextMapCarrier
// HTTPHeadersCarrier is the alias of opentracing.HTTPHeadersCarrier.
type HTTPHeadersCarrier = opentracing.HTTPHeadersCarrier
// TextMapPropagator is a combined Injector and Extractor for TextMap format.
type TextMapPropagator struct{}
var defaultTexMapPropagator = TextMapPropagator{}
// Inject implements Injector of TextMapPropagator
func (t *TextMapPropagator) Inject(sc *SpanContext, carrier interface{}) error {
writer, ok := carrier.(opentracing.TextMapWriter)
if !ok {
return ErrInvalidCarrier
}
writer.Set(FieldKeyTraceID, sc.traceID)
writer.Set(FieldKeySpanID, sc.spanID.String())
sc.ForeachBaggageItems(func(k string, v []string) bool {
if k != internalTrackLogKey { // internal baggage will not inject
writer.Set(PrefixBaggage+k, strings.Join(v, ","))
}
return true
})
return nil
}
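// exampleInject is a minimal sketch (not part of the original code): it injects a span
// context into a plain opentracing.TextMapCarrier, which is how the trace id and span id
// travel across process boundaries.
func exampleInject(sc *SpanContext) (opentracing.TextMapCarrier, error) {
	carrier := opentracing.TextMapCarrier{}
	err := defaultTexMapPropagator.Inject(sc, carrier)
	return carrier, err
}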
// Extract implements Extractor of TextMapPropagator.
func (t *TextMapPropagator) Extract(carrier interface{}) (opentracing.SpanContext, error) {
reader, ok := carrier.(opentracing.TextMapReader)
if !ok {
return nil, ErrInvalidCarrier
}
var (
traceID string
spanID ID
baggage = make(map[string][]string)
fieldCount int
err error
)
err = reader.ForeachKey(func(key, val string) error {
switch strings.ToLower(key) {
case FieldKeyTraceID:
traceID = val
fieldCount++
case FieldKeySpanID:
id, err := strconv.ParseUint(val, 16, 64)
if err != nil {
return ErrSpanContextCorrupted
}
spanID = ID(id)
fieldCount++
default:
lowerKey := strings.ToLower(key)
if strings.HasPrefix(lowerKey, PrefixBaggage) {
k := strings.TrimPrefix(lowerKey, PrefixBaggage)
baggage[k] = append(baggage[k], val)
}
}
return nil
})
if err != nil {
return nil, err
}
if fieldCount == 0 {
return nil, ErrSpanContextNotFound
}
if fieldCount < tracerFieldCount {
return nil, ErrSpanContextCorrupted
}
return &SpanContext{
traceID: traceID,
spanID: spanID,
baggage: baggage,
}, nil
}
// GetTraceIDKey returns http header name of traceid
func GetTraceIDKey() string {
return FieldKeyTraceID
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"fmt"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/opentracing/opentracing-go"
ptlog "github.com/opentracing/opentracing-go/log"
"github.com/cubefs/cubefs/blobstore/util/log"
)
const (
maxErrorLen = 32
)
// Span extends opentracing.Span
type Span interface {
opentracing.Span
// OperationName allows retrieving current operation name.
OperationName() string
// WithOperation returns a span whose operation name recursively extends the current one.
WithOperation(operation string) Span
// Tags returns tags for span
Tags() Tags
// Logs returns micro logs for span
Logs() []opentracing.LogRecord
// String returns traceID:spanID.
String() string
// TraceID returns traceID
TraceID() string
// AppendRPCTrackLog appends RPC track logs to baggage with default key fieldTrackLogKey.
AppendRPCTrackLog(logs []string)
// AppendTrackLog records cost time with startTime (duration=time.Since(startTime)) for a calling to a module and
// appends to baggage with default key fieldTrackLogKey.
AppendTrackLog(module string, startTime time.Time, err error, opts ...SpanOption)
// AppendTrackLogWithDuration records cost time with duration for a calling to a module and
// appends to baggage with default key fieldTrackLogKey.
AppendTrackLogWithDuration(module string, duration time.Duration, err error, opts ...SpanOption)
// AppendTrackLogWithFunc records cost time for the function calling to a module and
// appends to baggage with default key fieldTrackLogKey.
AppendTrackLogWithFunc(module string, fn func() error, opts ...SpanOption)
// TrackLog returns track log, calls BaggageItem with default key fieldTrackLogKey.
TrackLog() []string
// BaseLogger defines interface of application log apis.
log.BaseLogger
}
// spanImpl implements Span
type spanImpl struct {
operationName string
tracer *Tracer
context *SpanContext
startTime time.Time
duration time.Duration
tags Tags
logs []opentracing.LogRecord
// rootSpan, if true, indicates that this span is the root of the (sub)tree
// of spans and parentID is empty.
rootSpan bool
// references for this span
references []opentracing.SpanReference
rw sync.RWMutex
}
// Finish implements opentracing.Span API
func (s *spanImpl) Finish() {
s.FinishWithOptions(opentracing.FinishOptions{})
}
// FinishWithOptions implements opentracing.Span API
func (s *spanImpl) FinishWithOptions(opts opentracing.FinishOptions) {
finishTime := opts.FinishTime
if finishTime.IsZero() {
finishTime = time.Now()
}
s.duration = finishTime.Sub(s.startTime)
s.rw.Lock()
defer s.rw.Unlock()
s.logs = append(s.logs, opts.LogRecords...)
for _, ld := range opts.BulkLogData {
s.logs = append(s.logs, ld.ToLogRecord())
}
// TODO report span
}
// Context implements opentracing.Span API
func (s *spanImpl) Context() opentracing.SpanContext {
s.rw.RLock()
defer s.rw.RUnlock()
return s.context
}
// OperationName returns operationName for span
func (s *spanImpl) OperationName() string {
s.rw.RLock()
defer s.rw.RUnlock()
return s.operationName
}
// SetOperationName implements opentracing.Span API
func (s *spanImpl) SetOperationName(operationName string) opentracing.Span {
s.rw.Lock()
defer s.rw.Unlock()
s.operationName = operationName
return s
}
func (s *spanImpl) WithOperation(operation string) Span {
op := s.OperationName()
if len(op) > 0 {
if len(operation) > 0 {
op = fmt.Sprintf("%s:%s", op, operation)
}
} else {
op = operation
}
return &operationSpan{
Span: s,
operation: op,
}
}
// LogFields implements opentracing.Span API
func (s *spanImpl) LogFields(fields ...ptlog.Field) {
s.rw.Lock()
defer s.rw.Unlock()
s.logs = append(s.logs, opentracing.LogRecord{
Fields: fields,
Timestamp: time.Now(),
})
}
// LogKV implements opentracing.Span API
func (s *spanImpl) LogKV(keyValues ...interface{}) {
fields, err := ptlog.InterleavedKVToFields(keyValues...)
if err != nil {
s.LogFields(ptlog.Error(err), ptlog.String("function", "LogKV"))
return
}
s.LogFields(fields...)
}
// SetBaggageItem implements opentracing.Span API
func (s *spanImpl) SetBaggageItem(key, value string) opentracing.Span {
for _, ref := range s.references {
spanCtx, ok := ref.ReferencedContext.(*SpanContext)
if !ok {
continue
}
spanCtx.setBaggageItem(key, []string{value})
}
s.context.setBaggageItem(key, []string{value})
return s
}
// BaggageItem implements opentracing.Span API
func (s *spanImpl) BaggageItem(key string) string {
return strings.Join(s.context.baggageItem(key), ",")
}
// Tracer implements opentracing.Span API
func (s *spanImpl) Tracer() opentracing.Tracer {
return s.tracer
}
// SetTag implements opentracing.Span API
func (s *spanImpl) SetTag(key string, value interface{}) opentracing.Span {
s.rw.Lock()
defer s.rw.Unlock()
if s.tags == nil {
s.tags = Tags{}
}
s.tags[key] = value
return s
}
// Deprecated: use LogFields or LogKV (not implements)
func (s *spanImpl) LogEvent(event string) {
// Deprecated: kept empty to satisfy the opentracing.Span interface; use LogFields or LogKV.
}
// Deprecated: use LogFields or LogKV (not implements)
func (s *spanImpl) LogEventWithPayload(event string, payload interface{}) {
// Deprecated: kept empty to satisfy the opentracing.Span interface; use LogFields or LogKV.
}
// Deprecated: use LogFields or LogKV (not implements)
func (s *spanImpl) Log(data opentracing.LogData) {
// Deprecated: kept empty to satisfy the opentracing.Span interface; use LogFields or LogKV.
}
// Tags returns tags for span
func (s *spanImpl) Tags() Tags {
s.rw.RLock()
defer s.rw.RUnlock()
// copy
tags := make(map[string]interface{}, len(s.tags))
for key, value := range s.tags {
tags[key] = value
}
return tags
}
// Logs returns micro logs for span
func (s *spanImpl) Logs() []opentracing.LogRecord {
s.rw.RLock()
defer s.rw.RUnlock()
return s.logs
}
// AppendTrackLog records the cost time with startTime (duration = time.Since(startTime)) for a call to a module and
// appends it to baggage with the default key fieldTrackLogKey.
func (s *spanImpl) AppendTrackLog(module string, startTime time.Time, err error, opts ...SpanOption) {
s.AppendTrackLogWithDuration(module, time.Since(startTime), err, opts...)
}
// AppendTrackLogWithDuration records the cost time with duration for a call to a module and
// appends it to baggage with the default key fieldTrackLogKey.
func (s *spanImpl) AppendTrackLogWithDuration(module string, duration time.Duration, err error, opts ...SpanOption) {
spanOpt := &spanOptions{duration: durationMs, errorLength: maxErrorLen} // compatibility
for _, opt := range opts {
opt(spanOpt)
}
if spanOpt.duration == durationAny {
module += ":" + duration.String()
} else if dur := spanOpt.duration.Value(duration); dur > 0 {
module += ":" + strconv.FormatInt(dur, 10)
if spanOpt.durationUnit {
module += spanOpt.duration.Unit(duration)
}
}
if err != nil {
msg := err.Error()
errLen := spanOpt.errorLength
if len(msg) > errLen {
msg = msg[:errLen]
}
module += "/" + msg
}
s.track(module)
}
// AppendTrackLogWithFunc records the cost time of calling fn for a module.
func (s *spanImpl) AppendTrackLogWithFunc(module string, fn func() error, opts ...SpanOption) {
startTime := time.Now()
err := fn()
s.AppendTrackLog(module, startTime, err, opts...)
}
// AppendRPCTrackLog appends RPC track logs to baggage with default key fieldTrackLogKey.
func (s *spanImpl) AppendRPCTrackLog(logs []string) {
for _, trackLog := range logs {
s.track(trackLog)
}
}
// TrackLog returns the track logs stored in baggage under the default key fieldTrackLogKey.
func (s *spanImpl) TrackLog() []string {
return s.context.trackLogs()
}
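// A minimal usage sketch of the track-log API (illustrative only; callBlobnode and callProxy are hypothetical):
//
//	span := trace.SpanFromContextSafe(ctx)
//	startTime := time.Now()
//	err := callBlobnode(ctx)
//	span.AppendTrackLog("blobnode", startTime, err)
//	// or wrap the call directly:
//	span.AppendTrackLogWithFunc("proxy", func() error { return callProxy(ctx) })
//	span.Info("tracks:", span.TrackLog())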
func (s *spanImpl) track(value string) {
maxTracks := s.tracer.options.maxInternalTrack
for _, ref := range s.references {
spanCtx, ok := ref.ReferencedContext.(*SpanContext)
if !ok {
continue
}
spanCtx.append(maxTracks, value)
}
s.context.append(maxTracks, value)
}
// String returns traceID:spanID.
func (s *spanImpl) String() string {
return fmt.Sprintf("%s:%s", s.context.traceID, s.context.spanID)
}
// TraceID returns the traceID.
func (s *spanImpl) TraceID() string {
return s.context.traceID
}
// -------------------------------------------------------------------
const (
defaultCalldepth = 3
)
func (s *spanImpl) output(lvl log.Level, v []interface{}) {
if log.DefaultLogger.GetOutputLevel() > lvl {
return
}
log.DefaultLogger.Output(s.String(), lvl, defaultCalldepth, v...)
}
func (s *spanImpl) outputf(lvl log.Level, format string, v []interface{}) {
if log.DefaultLogger.GetOutputLevel() > lvl {
return
}
log.DefaultLogger.Outputf(s.String(), lvl, defaultCalldepth, format, v...)
}
func (s *spanImpl) Println(v ...interface{}) { s.output(log.Linfo, v) }
func (s *spanImpl) Printf(format string, v ...interface{}) { s.outputf(log.Linfo, format, v) }
func (s *spanImpl) Debug(v ...interface{}) { s.output(log.Ldebug, v) }
func (s *spanImpl) Debugf(format string, v ...interface{}) { s.outputf(log.Ldebug, format, v) }
func (s *spanImpl) Info(v ...interface{}) { s.output(log.Linfo, v) }
func (s *spanImpl) Infof(format string, v ...interface{}) { s.outputf(log.Linfo, format, v) }
func (s *spanImpl) Warn(v ...interface{}) { s.output(log.Lwarn, v) }
func (s *spanImpl) Warnf(format string, v ...interface{}) { s.outputf(log.Lwarn, format, v) }
func (s *spanImpl) Error(v ...interface{}) { s.output(log.Lerror, v) }
func (s *spanImpl) Errorf(format string, v ...interface{}) { s.outputf(log.Lerror, format, v) }
func (s *spanImpl) Panic(v ...interface{}) {
str := fmt.Sprintln(v...)
s.output(log.Lpanic, v)
panic(s.String() + " -> " + str)
}
func (s *spanImpl) Panicf(format string, v ...interface{}) {
str := fmt.Sprintf(format, v...)
s.outputf(log.Lpanic, format, v)
panic(s.String() + " -> " + str)
}
func (s *spanImpl) Fatal(v ...interface{}) {
s.output(log.Lfatal, v)
os.Exit(1)
}
func (s *spanImpl) Fatalf(format string, v ...interface{}) {
s.outputf(log.Lfatal, format, v)
os.Exit(1)
}
// -------------------------------------------------------------------
type operationSpan struct {
Span
operation string
}
func (s *operationSpan) OperationName() string {
return s.operation
}
func (s *operationSpan) SetOperationName(operation string) opentracing.Span {
s.operation = operation
return s
}
func (s *operationSpan) WithOperation(operation string) Span {
op := s.OperationName()
if len(op) > 0 {
if len(operation) > 0 {
op = fmt.Sprintf("%s:%s", op, operation)
}
} else {
op = operation
}
return &operationSpan{
Span: s,
operation: op,
}
}
func (s *operationSpan) String() string {
span := s.Span
next := true
for next {
switch x := span.(type) {
case *operationSpan:
span = x.Span
default:
next = false
}
}
if op := s.OperationName(); op != "" {
return fmt.Sprintf("%s:%s", span.String(), op)
}
return span.String()
}
func (s *operationSpan) output(lvl log.Level, v []interface{}) {
if log.DefaultLogger.GetOutputLevel() > lvl {
return
}
log.DefaultLogger.Output(s.String(), lvl, defaultCalldepth, v...)
}
func (s *operationSpan) outputf(lvl log.Level, format string, v []interface{}) {
if log.DefaultLogger.GetOutputLevel() > lvl {
return
}
log.DefaultLogger.Outputf(s.String(), lvl, defaultCalldepth, format, v...)
}
func (s *operationSpan) Println(v ...interface{}) { s.output(log.Linfo, v) }
func (s *operationSpan) Printf(format string, v ...interface{}) { s.outputf(log.Linfo, format, v) }
func (s *operationSpan) Debug(v ...interface{}) { s.output(log.Ldebug, v) }
func (s *operationSpan) Debugf(format string, v ...interface{}) { s.outputf(log.Ldebug, format, v) }
func (s *operationSpan) Info(v ...interface{}) { s.output(log.Linfo, v) }
func (s *operationSpan) Infof(format string, v ...interface{}) { s.outputf(log.Linfo, format, v) }
func (s *operationSpan) Warn(v ...interface{}) { s.output(log.Lwarn, v) }
func (s *operationSpan) Warnf(format string, v ...interface{}) { s.outputf(log.Lwarn, format, v) }
func (s *operationSpan) Error(v ...interface{}) { s.output(log.Lerror, v) }
func (s *operationSpan) Errorf(format string, v ...interface{}) { s.outputf(log.Lerror, format, v) }
func (s *operationSpan) Panic(v ...interface{}) {
str := fmt.Sprintln(v...)
s.output(log.Lpanic, v)
panic(s.String() + " -> " + str)
}
func (s *operationSpan) Panicf(format string, v ...interface{}) {
str := fmt.Sprintf(format, v...)
s.outputf(log.Lpanic, format, v)
panic(s.String() + " -> " + str)
}
func (s *operationSpan) Fatal(v ...interface{}) {
s.output(log.Lfatal, v)
os.Exit(1)
}
func (s *operationSpan) Fatalf(format string, v ...interface{}) {
s.outputf(log.Lfatal, format, v)
os.Exit(1)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"fmt"
"hash/maphash"
"sync"
)
const (
internalTrackLogKey = "internal-baggage-key-tracklog"
)
// ID is the identifier type used for spanID or traceID.
type ID uint64
func (id ID) String() string {
return fmt.Sprintf("%016x", uint64(id))
}
// RandomID generates an ID for traceID or spanID.
func RandomID() ID {
return ID(new(maphash.Hash).Sum64())
}
// SpanContext implements opentracing.SpanContext
type SpanContext struct {
// traceID represents globally unique ID of the trace.
traceID string
// spanID represents span ID that must be unique within its trace.
spanID ID
// parentID refers to the ID of the parent span.
// Should be 0 if the current span is a root span.
parentID ID
// Distributed Context baggage.
baggage map[string][]string
sync.RWMutex
}
// ForeachBaggageItem implements opentracing.SpanContext API
func (s *SpanContext) ForeachBaggageItem(handler func(k, v string) bool) {
panic("not implemented")
}
// ForeachBaggageItems calls the handler function for each baggage key/values pair.
func (s *SpanContext) ForeachBaggageItems(handler func(k string, v []string) bool) {
s.Lock()
defer s.Unlock()
for k, v := range s.baggage {
if !handler(k, v) {
break
}
}
}
func (s *SpanContext) setBaggageItem(key string, value []string) {
s.Lock()
defer s.Unlock()
if s.baggage == nil {
s.baggage = map[string][]string{key: value}
return
}
s.baggage[key] = value
}
func (s *SpanContext) trackLogs() []string {
return s.baggageItemDeepCopy(internalTrackLogKey)
}
func (s *SpanContext) append(maxTracks int, value string) {
s.Lock()
if s.baggage == nil {
s.baggage = make(map[string][]string)
}
if len(s.baggage[internalTrackLogKey]) < maxTracks {
s.baggage[internalTrackLogKey] = append(s.baggage[internalTrackLogKey], value)
}
s.Unlock()
}
func (s *SpanContext) baggageItem(key string) []string {
s.RLock()
defer s.RUnlock()
return s.baggage[key]
}
func (s *SpanContext) baggageItemDeepCopy(key string) (item []string) {
s.RLock()
item = append(item, s.baggage[key]...)
s.RUnlock()
return
}
// IsValid returns true if SpanContext is valid
func (s *SpanContext) IsValid() bool {
return s.traceID != "" && s.spanID != 0
}
// IsEmpty returns true if the span context is empty.
func (s *SpanContext) IsEmpty() bool {
return !s.IsValid() && len(s.baggage) == 0
}
// Copyright 2024 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"time"
)
// SpanOption is a function that sets some option on the span.
type SpanOption func(*spanOptions)
type spanOptionDuration uint8
const (
durationAny spanOptionDuration = iota
durationNone
durationNs
durationUs
durationMs // default
durationSecond
durationMinute
durationHour
)
func (d spanOptionDuration) Value(duration time.Duration) int64 {
var v int64
switch d {
case durationNone:
case durationNs:
v = duration.Nanoseconds()
case durationUs:
v = duration.Microseconds()
case durationMs:
v = duration.Milliseconds()
case durationSecond:
v = int64(duration / time.Second)
case durationMinute:
v = int64(duration / time.Minute)
case durationHour:
v = int64(duration / time.Hour)
}
return v
}
func (d spanOptionDuration) Unit(duration time.Duration) string {
switch d {
case durationNs:
return "ns"
case durationUs:
return "us"
case durationMs:
return "ms"
case durationSecond:
return "s"
case durationMinute:
return "m"
case durationHour:
return "h"
}
return ""
}
type spanOptions struct {
duration spanOptionDuration
durationUnit bool
errorLength int
}
func OptSpanDurationAny() SpanOption { return func(o *spanOptions) { o.duration = durationAny } }
func OptSpanDurationNone() SpanOption { return func(o *spanOptions) { o.duration = durationNone } }
func OptSpanDurationNs() SpanOption { return func(o *spanOptions) { o.duration = durationNs } }
func OptSpanDurationUs() SpanOption { return func(o *spanOptions) { o.duration = durationUs } }
func OptSpanDurationMs() SpanOption { return func(o *spanOptions) { o.duration = durationMs } }
func OptSpanDurationSecond() SpanOption { return func(o *spanOptions) { o.duration = durationSecond } }
func OptSpanDurationMinute() SpanOption { return func(o *spanOptions) { o.duration = durationMinute } }
func OptSpanDurationHour() SpanOption { return func(o *spanOptions) { o.duration = durationHour } }
func OptSpanDurationUnit() SpanOption { return func(o *spanOptions) { o.durationUnit = true } }
func OptSpanErrorLength(l int) SpanOption {
return func(o *spanOptions) {
if l >= 0 {
o.errorLength = l
}
}
}
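// A minimal sketch of combining span options (illustrative only; doWrite and ctx are hypothetical):
//
//	startTime := time.Now()
//	err := doWrite(ctx)
//	span.AppendTrackLog("put", startTime, err,
//		trace.OptSpanDurationUs(),    // report the duration in microseconds
//		trace.OptSpanDurationUnit(),  // append the unit suffix, e.g. "us"
//		trace.OptSpanErrorLength(16)) // truncate error messages to 16 bytes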
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"context"
"encoding/json"
"fmt"
"net/http"
"os"
"path"
"time"
"github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go/ext"
)
const (
defaultRootSpanName = ""
defaultMaxLogsPerSpan = 50
defaultInternalTrack = 64
)
// ChildOf is the alias of opentracing.ChildOf
var ChildOf = opentracing.ChildOf
// FollowsFrom is the alias of opentracing.FollowsFrom
var FollowsFrom = opentracing.FollowsFrom
// StartTime is an alias of opentracing.StartTime.
type StartTime = opentracing.StartTime
// Tags is an extension of opentracing.Tags.
type Tags opentracing.Tags
// Apply satisfies the StartSpanOption interface.
func (t Tags) Apply(options *opentracing.StartSpanOptions) {
if options.Tags == nil {
options.Tags = make(opentracing.Tags)
}
for k, v := range t {
options.Tags[k] = v
}
}
// ToSlice converts the tags to a slice of "key:value" strings.
func (t Tags) ToSlice() (ret []string) {
for k := range t {
ret = append(ret, k+":"+fmt.Sprint(t[k]))
}
return
}
// Marshal marshals the tracer tags to JSON.
func (t Tags) Marshal() (ret []byte, err error) {
ret, err = json.Marshal(t)
return
}
// Tag is an alias of opentracing.Tag.
type Tag = opentracing.Tag
// Options holds tracer options.
type Options struct {
maxLogsPerSpan int
maxInternalTrack int
}
// Tracer implements opentracing.Tracer
type Tracer struct {
serviceName string
options Options
}
// init sets default global tracer
func init() {
tracer := NewTracer(path.Base(os.Args[0]))
SetGlobalTracer(tracer)
}
// NewTracer creates a tracer with serviceName
func NewTracer(serviceName string, opts ...TracerOption) *Tracer {
t := &Tracer{
serviceName: serviceName,
}
for _, option := range opts {
option(t)
}
if t.options.maxLogsPerSpan <= 0 {
t.options.maxLogsPerSpan = defaultMaxLogsPerSpan
}
if t.options.maxInternalTrack <= 0 {
t.options.maxInternalTrack = defaultInternalTrack
}
return t
}
// StartSpan implements the StartSpan() method of opentracing.Tracer.
// It creates, starts, and returns a new Span with the given `operationName`,
// incorporating the given StartSpanOption `opts`.
func (t *Tracer) StartSpan(operationName string, options ...opentracing.StartSpanOption) opentracing.Span {
sso := opentracing.StartSpanOptions{}
for _, o := range options {
o.Apply(&sso)
}
return t.startSpanWithOptions(operationName, sso)
}
func (t *Tracer) startSpanWithOptions(operationName string, opts opentracing.StartSpanOptions) Span {
startTime := opts.StartTime
if startTime.IsZero() {
startTime = time.Now()
}
var (
hasParent bool
parent *SpanContext
references []opentracing.SpanReference
ctx = &SpanContext{}
)
for _, reference := range opts.References {
spanCtx, ok := reference.ReferencedContext.(*SpanContext)
if !ok {
continue
}
if spanCtx == nil || spanCtx.IsEmpty() {
continue
}
if spanCtx.IsValid() {
references = append(references, reference)
}
if !hasParent {
parent = spanCtx
hasParent = reference.Type == opentracing.ChildOfRef
}
}
if !hasParent && parent != nil && !parent.IsEmpty() {
hasParent = true
}
if !hasParent || (parent != nil && !parent.IsValid()) {
ctx.traceID = RandomID().String()
ctx.spanID = RandomID()
ctx.parentID = 0
} else {
ctx.traceID = parent.traceID
ctx.spanID = RandomID()
ctx.parentID = parent.spanID
}
if hasParent {
// copy baggage items
parent.ForeachBaggageItems(func(k string, v []string) bool {
ctx.setBaggageItem(k, v)
return true
})
}
tags := opts.Tags
span := &spanImpl{
operationName: operationName,
startTime: startTime,
tags: tags,
context: ctx,
tracer: t,
references: references,
duration: 0,
}
span.rootSpan = ctx.parentID == 0
return span
}
// Inject implements Inject() method of opentracing.Tracer
func (t *Tracer) Inject(sc opentracing.SpanContext, format interface{}, carrier interface{}) error {
s, ok := sc.(*SpanContext)
if !ok {
return opentracing.ErrInvalidSpanContext
}
switch format {
case TextMap, HTTPHeaders:
return defaultTexMapPropagator.Inject(s, carrier)
default:
return ErrUnsupportedFormat
}
}
// Extract implements Extract() method of opentracing.Tracer
func (t *Tracer) Extract(format interface{}, carrier interface{}) (opentracing.SpanContext, error) {
switch format {
case TextMap, HTTPHeaders:
return defaultTexMapPropagator.Extract(carrier)
default:
return nil, ErrUnsupportedFormat
}
}
// Close releases all resources
func (t *Tracer) Close() error {
// TODO report span
return nil
}
// StartSpanFromContext starts and returns a Span with `operationName`, using
// any Span found within `ctx` as a ChildOfRef. If no such parent could be
// found, StartSpanFromContext creates a root (parentless) Span.
func StartSpanFromContext(ctx context.Context, operationName string, opts ...opentracing.StartSpanOption) (Span, context.Context) {
span, ctx := opentracing.StartSpanFromContext(ctx, operationName, opts...)
return span.(Span), ctx
}
// StartSpanFromContextWithTraceID starts and returns a new span with `operationName` and traceID.
func StartSpanFromContextWithTraceID(ctx context.Context, operationName string, traceID string, opts ...opentracing.StartSpanOption) (Span, context.Context) {
span, ctx := opentracing.StartSpanFromContext(ctx, operationName, opts...)
s := span.(*spanImpl)
s.context.traceID = traceID
return s, ctx
}
// StartSpanFromHTTPHeaderSafe starts and returns a Span with `operationName`, extracted from the http.Request.
func StartSpanFromHTTPHeaderSafe(r *http.Request, operationName string) (Span, context.Context) {
spanCtx, _ := Extract(HTTPHeaders, HTTPHeadersCarrier(r.Header))
traceID := r.Header.Get(RequestIDKey)
if traceID == "" {
return StartSpanFromContext(r.Context(), operationName, ext.RPCServerOption(spanCtx))
}
return StartSpanFromContextWithTraceID(r.Context(), operationName, traceID, ext.RPCServerOption(spanCtx))
}
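// A minimal server-side sketch (illustrative only; the "/put" route and handlePut are hypothetical):
//
//	http.HandleFunc("/put", func(w http.ResponseWriter, r *http.Request) {
//		span, ctx := trace.StartSpanFromHTTPHeaderSafe(r, "put")
//		defer span.Finish()
//		handlePut(ctx, w, r)
//	})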
// ContextWithSpan returns a new `context.Context` that holds a reference to
// the span. If span is nil, a new context without an active span is returned.
func ContextWithSpan(ctx context.Context, span Span) context.Context {
return opentracing.ContextWithSpan(ctx, span)
}
// SpanFromContext returns the `Span` previously associated with `ctx`, or
// `nil` if no such `Span` could be found.
func SpanFromContext(ctx context.Context) Span {
span := opentracing.SpanFromContext(ctx)
s, ok := span.(Span)
if !ok {
return nil
}
return s
}
// SpanFromContextSafe returns the `Span` previously associated with `ctx`, or
// creates a root Span with the default name.
func SpanFromContextSafe(ctx context.Context) Span {
span := opentracing.SpanFromContext(ctx)
s, ok := span.(Span)
if !ok || s == nil {
return opentracing.GlobalTracer().StartSpan(defaultRootSpanName).(Span)
}
return s
}
// SetGlobalTracer sets the [singleton] opentracing.Tracer returned by
// GlobalTracer(). Those who use GlobalTracer (rather than directly manage an
// opentracing.Tracer instance) should call SetGlobalTracer as early as
// possible in main(), prior to calling the `StartSpan` global func below.
// Prior to calling `SetGlobalTracer`, any Spans started via the `StartSpan`
// (etc) globals are noops.
func SetGlobalTracer(tracer *Tracer) {
opentracing.SetGlobalTracer(tracer)
}
// CloseGlobalTracer closes global tracer gracefully.
func CloseGlobalTracer() {
tracer, ok := opentracing.GlobalTracer().(*Tracer)
if !ok {
return
}
tracer.Close()
}
// GlobalTracer returns the global singleton `Tracer` implementation.
func GlobalTracer() *Tracer {
t := opentracing.GlobalTracer()
return t.(*Tracer)
}
// Extract returns a SpanContext instance given `format` and `carrier`.
func Extract(format interface{}, carrier interface{}) (opentracing.SpanContext, error) {
return GlobalTracer().Extract(format, carrier)
}
// InjectWithHTTPHeader takes the SpanContext found in `ctx` and injects it into
// the request headers for propagation within `HTTPHeadersCarrier` and `HTTPHeaders`.
func InjectWithHTTPHeader(ctx context.Context, r *http.Request) error {
span := SpanFromContextSafe(ctx)
ext.SpanKindRPCClient.Set(span)
ext.HTTPMethod.Set(span, r.Method)
return span.Tracer().Inject(span.Context(), HTTPHeaders, HTTPHeadersCarrier(r.Header))
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
// TracerOption is a function that sets some option on the tracer
type TracerOption func(tracer *Tracer)
// TracerOptions is a factory for all available TracerOption values.
var TracerOptions tracerOptions
type tracerOptions struct{}
func (tracerOptions) MaxLogsPerSpan(maxLogsPerSpan int) TracerOption {
return func(tracer *Tracer) {
tracer.options.maxLogsPerSpan = maxLogsPerSpan
}
}
func (tracerOptions) MaxInternalTrackLog(internalTrack int) TracerOption {
return func(tracer *Tracer) {
tracer.options.maxInternalTrack = internalTrack
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bytespool
import "sync"
func newBytes(size int) func() interface{} {
return func() interface{} {
return make([]byte, size)
}
}
const (
zeroSize int = 1 << 14 // 16K
// 1K - 2K - 4K - 8K - 16K - 32K - 64K
numPools = 7
sizeStep = 2
startSize int = 1 << 10 // 1K
maxSize int = 1 << 16 // 64K
)
var (
zero = make([]byte, zeroSize)
pools [numPools]sync.Pool
poolSize [numPools]int
)
func init() {
size := startSize
for ii := 0; ii < numPools; ii++ {
pools[ii] = sync.Pool{
New: newBytes(size),
}
poolSize[ii] = size
size *= sizeStep
}
}
// GetPool returns a sync.Pool that generates byte slices of at least the given size.
// It returns nil if no suitable pool exists.
func GetPool(size int) *sync.Pool {
for idx, psize := range poolSize {
if size <= psize {
return &pools[idx]
}
}
return nil
}
// Alloc returns a byte slice of the given size.
// It makes a new slice if the size exceeds the largest pool.
func Alloc(size int) []byte {
if pool := GetPool(size); pool != nil {
b := pool.Get().([]byte)
return b[:size]
}
return make([]byte, size)
}
// Free puts the byte slice back into a suitable pool.
// The slice is discarded if it exceeds the largest pool size.
func Free(b []byte) {
size := cap(b)
if size > maxSize {
return
}
b = b[0:size]
for ii := numPools - 1; ii >= 0; ii-- {
if size >= poolSize[ii] {
pools[ii].Put(b) // nolint: staticcheck
return
}
}
}
// Zero sets all bytes of the slice b to zero.
func Zero(b []byte) {
for len(b) > 0 {
n := copy(b, zero)
b = b[n:]
}
}
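// A minimal usage sketch of the pool (illustrative only; reader is hypothetical):
//
//	buf := bytespool.Alloc(4096) // served from the 4K pool
//	defer bytespool.Free(buf)    // returned to the pool; oversized slices are dropped
//	n, err := reader.Read(buf)
//	_, _ = n, err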
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package defaulter
// Set a basic type's value to its default.
// The epsilon for floats is 1e-9.
import (
"fmt"
"math"
"reflect"
)
// Empty sets string value to default if it's empty.
func Empty(valPointer *string, defaultVal string) {
if *valPointer == "" {
*valPointer = defaultVal
}
}
// Equal sets the basic value to the default if it equals zero.
func Equal(valPointer interface{}, defaultVal interface{}) {
setDefault(valPointer, defaultVal, equalZero)
}
// Less sets basic value to default if it is less than zero.
func Less(valPointer interface{}, defaultVal interface{}) {
setDefault(valPointer, defaultVal, lessZero)
}
// LessOrEqual sets basic value to default if it is not greater than zero.
func LessOrEqual(valPointer interface{}, defaultVal interface{}) {
setDefault(valPointer, defaultVal, lessOrEqualZero)
}
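// A minimal usage sketch (illustrative only; the config struct and its field values are hypothetical):
//
//	type config struct {
//		Name    string
//		Retries int
//		Ratio   float64
//	}
//	conf := config{}
//	defaulter.Empty(&conf.Name, "default-name")
//	defaulter.LessOrEqual(&conf.Retries, 3)
//	defaulter.Less(&conf.Ratio, 0.5)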
func setDefault(valPointer interface{}, defaultVal interface{},
cmp func(reflect.Value, reflect.Kind) bool) {
typ := reflect.TypeOf(valPointer)
if typ.Kind() != reflect.Ptr {
panic(typ.Name() + " must be pointer")
}
typ = typ.Elem()
val := reflect.ValueOf(valPointer).Elem()
dTyp, dVal := parseDefault(defaultVal)
if typ.Kind() != dTyp.Kind() {
panic(fmt.Sprintf("not the same type %s != %s", typ.Kind().String(), dTyp.Kind().String()))
}
if cmp(val, typ.Kind()) {
val.Set(dVal)
}
}
func parseDefault(defaultVal interface{}) (reflect.Type, reflect.Value) {
typ, val := reflect.TypeOf(defaultVal), reflect.ValueOf(defaultVal)
if typ.Kind() == reflect.Ptr {
typ, val = typ.Elem(), val.Elem()
}
return typ, val
}
func equalZero(val reflect.Value, typ reflect.Kind) bool {
switch typ {
case reflect.Bool:
return !val.Bool()
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
return val.Int() == 0
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
return val.Uint() == 0
case reflect.Float32, reflect.Float64:
return math.Float64bits(val.Float()) == 0
default:
panic("equal zero unsupported type " + typ.String())
}
}
func lessZero(val reflect.Value, typ reflect.Kind) bool {
switch typ {
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
return val.Int() < 0
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
return false
case reflect.Float32, reflect.Float64:
return val.Float() < -1e-9
default:
panic("less zero unsupported type " + typ.String())
}
}
func lessOrEqualZero(val reflect.Value, typ reflect.Kind) bool {
switch typ {
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
return val.Int() <= 0
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
return val.Uint() == 0
case reflect.Float32, reflect.Float64:
return val.Float() < 1e-9
default:
panic("less or equal zero unsupported type " + typ.String())
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"fmt"
"runtime"
"strconv"
"strings"
)
const prefix = " --> "
// Cause returns the cause of this error
func Cause(err error) error {
if e, ok := err.(interface{ Cause() error }); ok {
if diag := e.Cause(); diag != nil {
return diag
}
}
return err
}
// Detail returns the detail of the error, with the prefix sign added at the beginning.
func Detail(err error) string {
if err == nil {
return ""
}
if e, ok := err.(interface{ Details() string }); ok {
return e.Details()
}
builder := strings.Builder{}
builder.WriteString(prefix)
builder.WriteString(err.Error())
return builder.String()
}
// Error is an error with detail.
type Error struct {
Err error
Why error
File string
Line int
Cmd []interface{}
}
// Base returns a runtime.Caller(1) detail error based on the error.
func Base(err error, cmd ...interface{}) *Error {
_, file, line, ok := runtime.Caller(1)
if !ok {
file = "???"
}
return &Error{Err: Cause(err), Why: err, File: file, Line: line, Cmd: cmd}
}
// Info is an alias of Base. Deprecated.
func Info(err error, cmd ...interface{}) *Error {
_, file, line, ok := runtime.Caller(1)
if !ok {
file = "???"
}
return &Error{Err: Cause(err), Why: err, File: file, Line: line, Cmd: cmd}
}
// BaseEx returns a runtime.Caller(skip) detail error based on the error.
// File and line tracing may have problems with go1.9,
// see related issue: https://github.com/golang/go/issues/22916
func BaseEx(skip int, err error, cmd ...interface{}) *Error {
oldErr := err
if e, ok := err.(*Error); ok {
err = e.Err
}
_, file, line, ok := runtime.Caller(skip)
if !ok {
file = "???"
}
return &Error{Err: Cause(err), Why: oldErr, File: file, Line: line, Cmd: cmd}
}
// InfoEx is an alias of BaseEx. Deprecated.
func InfoEx(skip int, err error, cmd ...interface{}) *Error {
return BaseEx(skip, err, cmd...)
}
// Cause returns the cause of this error
func (r *Error) Cause() error {
return r.Err
}
// Unwrap returns the Why error.
func (r *Error) Unwrap() error {
return r.Why
}
// Error returns base error Error()
func (r *Error) Error() string {
if r.Err != nil {
return r.Err.Error()
}
return ""
}
// Details returns detail message of the error
func (r *Error) Details() string {
builder := strings.Builder{}
builder.WriteString(prefix)
builder.WriteString(r.File)
builder.WriteByte(':')
builder.WriteString(strconv.Itoa(r.Line))
builder.WriteByte(' ')
builder.WriteString(r.Error())
if len(r.Cmd) > 0 {
builder.WriteString(" ~ ")
builder.WriteString(stringJoin(r.Cmd...))
}
if r.Why != nil && r.Why != r.Err {
builder.WriteString(Detail(r.Why))
}
return builder.String()
}
// Detail sets the why error and returns the Error.
func (r *Error) Detail(err error) *Error {
r.Why = err
return r
}
func stringJoin(v ...interface{}) string {
builder := strings.Builder{}
for idx, value := range v {
if idx > 0 {
builder.WriteByte(' ')
}
builder.WriteString(fmt.Sprintf("%+v", value))
}
return builder.String()
}
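// A minimal usage sketch of the detail error (illustrative only; openFile and path are hypothetical):
//
//	if err := openFile(path); err != nil {
//		e := errors.Base(err, "open file", path)
//		log.Error(errors.Detail(e)) // prints " --> file.go:<line> <err> ~ open file <path>"
//	}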
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"errors"
"fmt"
)
// New alias of errors.New
func New(msg string) error {
return errors.New(msg)
}
// Newf alias of fmt.Errorf
func Newf(format string, a ...interface{}) error {
return fmt.Errorf(format, a...)
}
// Newx returns an error built from multiple messages.
func Newx(v ...interface{}) error {
return errors.New(stringJoin(v...))
}
// As alias of errors.As
func As(err error, target interface{}) bool {
return errors.As(err, target)
}
// Is alias of errors.Is
func Is(err, target error) bool {
return errors.Is(err, target)
}
// Unwrap alias of errors.Unwrap
func Unwrap(err error) error {
return errors.Unwrap(err)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
)
// defines log level
const (
Ldebug Level = iota
Linfo
Lwarn
Lerror
Lpanic
Lfatal
maxLevel
)
// Level type log level
type Level int
// UnmarshalJSON deserializes the log level from JSON.
// It tries a numeric value first for compatibility, then falls back to a string name.
func (l *Level) UnmarshalJSON(data []byte) error {
if lvl, err := strconv.Atoi(string(data)); err == nil {
if lvl < 0 || lvl >= int(maxLevel) {
return fmt.Errorf("invalid log level: %s", string(data))
}
*l = Level(lvl)
return nil
}
var lvlName string
json.Unmarshal(data, &lvlName)
lvl, exist := levelMapping[strings.ToLower(lvlName)]
if !exist {
return fmt.Errorf("invalid log level: %s", string(data))
}
*l = lvl
return nil
}
// UnmarshalYAML deserializes the log level from YAML.
func (l *Level) UnmarshalYAML(unmarshal func(interface{}) error) error {
var lvlName string
unmarshal(&lvlName)
lvl, exist := levelMapping[strings.ToLower(lvlName)]
if !exist {
return fmt.Errorf("invalid log level: %s", lvlName)
}
*l = lvl
return nil
}
var levelMapping = map[string]Level{
"debug": Ldebug,
"info": Linfo,
"warn": Lwarn,
"error": Lerror,
"panic": Lpanic,
"fatal": Lfatal,
}
var levelToStrings = []string{
"[DEBUG]",
"[INFO]",
"[WARN]",
"[ERROR]",
"[PANIC]",
"[FATAL]",
}
// DefaultLogger is the default logger, initialized with os.Stderr.
var DefaultLogger Logger
// BaseLogger defines the interface of application logging APIs.
type BaseLogger interface {
Printf(format string, v ...interface{})
Println(v ...interface{})
Debugf(format string, v ...interface{})
Debug(v ...interface{})
Infof(format string, v ...interface{})
Info(v ...interface{})
Warnf(format string, v ...interface{})
Warn(v ...interface{})
Errorf(format string, v ...interface{})
Error(v ...interface{})
Fatalf(format string, v ...interface{})
Fatal(v ...interface{})
Panicf(format string, v ...interface{})
Panic(v ...interface{})
}
// Logger is the interface that a logger implementation should satisfy.
type Logger interface {
BaseLogger
// atomically control log level
GetOutputLevel() Level
SetOutputLevel(logLevel Level)
SetOutput(w io.Writer)
Output(id string, lvl Level, calldepth int, a ...interface{}) error
Outputf(id string, lvl Level, calldepth int, format string, a ...interface{}) error
// these two functions implement the raft Logger interface
Warningf(format string, v ...interface{})
Warning(v ...interface{})
}
func init() {
DefaultLogger = New(os.Stderr, 3)
}
// ChangeDefaultLevelHandler returns the path and HTTP handler of the default log level modification API.
func ChangeDefaultLevelHandler() (string, http.HandlerFunc) {
return "/log/level", func(w http.ResponseWriter, r *http.Request) {
switch r.Method {
case http.MethodGet:
level := DefaultLogger.GetOutputLevel()
w.Write([]byte(fmt.Sprintf("{\"level\": \"%s\"}", levelToStrings[level])))
case http.MethodPost:
if err := r.ParseForm(); err != nil {
w.WriteHeader(http.StatusBadRequest)
return
}
var level Level
lvlName := r.FormValue("level")
if lvl, ok := levelMapping[lvlName]; ok {
level = lvl
} else if err := level.UnmarshalJSON([]byte(lvlName)); err != nil {
w.WriteHeader(http.StatusBadRequest)
return
}
DefaultLogger.SetOutputLevel(Level(level))
default:
w.WriteHeader(http.StatusMethodNotAllowed)
}
}
}
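// A minimal registration sketch (illustrative only):
//
//	path, handler := log.ChangeDefaultLevelHandler()
//	http.HandleFunc(path, handler)
//	// GET  /log/level               -> {"level": "[INFO]"}
//	// POST /log/level (level=debug) -> changes the default output level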
func Printf(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Linfo, format, v) }
func Println(v ...interface{}) { DefaultLogger.(*logger).output(Linfo, v) }
func Debugf(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Ldebug, format, v) }
func Debug(v ...interface{}) { DefaultLogger.(*logger).output(Ldebug, v) }
func Infof(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Linfo, format, v) }
func Info(v ...interface{}) { DefaultLogger.(*logger).output(Linfo, v) }
func Warnf(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Lwarn, format, v) }
func Warn(v ...interface{}) { DefaultLogger.(*logger).output(Lwarn, v) }
func Errorf(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Lerror, format, v) }
func Error(v ...interface{}) { DefaultLogger.(*logger).output(Lerror, v) }
func Fatalf(format string, v ...interface{}) {
DefaultLogger.(*logger).outputf(Lfatal, format, v)
os.Exit(1)
}
func Fatal(v ...interface{}) {
DefaultLogger.(*logger).output(Lfatal, v)
os.Exit(1)
}
func Panicf(format string, v ...interface{}) {
s := fmt.Sprintf(format, v...)
DefaultLogger.(*logger).outputf(Lpanic, format, v)
panic(s)
}
func Panic(v ...interface{}) {
s := fmt.Sprintln(v...)
DefaultLogger.(*logger).output(Lpanic, v)
panic(s)
}
func GetOutputLevel() Level { return DefaultLogger.GetOutputLevel() }
func SetOutputLevel(lvl Level) { DefaultLogger.SetOutputLevel(lvl) }
func SetOutput(w io.Writer) { DefaultLogger.SetOutput(w) }
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"bytes"
"fmt"
"io"
"os"
"runtime"
"sync"
"sync/atomic"
"time"
)
type logger struct {
level int32
calldepth int
writer atomic.Value
pool sync.Pool
}
type logWriter struct {
io.Writer
}
// New returns a logger with the default level Linfo.
// Output buffers are reused via a bytes pool.
func New(out io.Writer, calldepth int) Logger {
l := &logger{
level: int32(Linfo),
calldepth: calldepth,
pool: sync.Pool{
New: func() interface{} {
return new(bytes.Buffer)
},
},
}
l.writer.Store(&logWriter{out})
return l
}
func (l *logger) Output(id string, lvl Level, calldepth int, a ...interface{}) error {
if int32(lvl) < atomic.LoadInt32(&l.level) || lvl >= maxLevel {
return nil
}
_, file, line, ok := runtime.Caller(calldepth)
if !ok {
file = "???"
line = 0
}
return l.write(id, lvl, file, line, fmt.Sprintln(a...))
}
func (l *logger) Outputf(id string, lvl Level, calldepth int, format string, a ...interface{}) error {
if int32(lvl) < atomic.LoadInt32(&l.level) || lvl >= maxLevel {
return nil
}
_, file, line, ok := runtime.Caller(calldepth)
if !ok {
file = "???"
line = 0
}
return l.write(id, lvl, file, line, fmt.Sprintf(format, a...))
}
func (l *logger) write(id string, lvl Level, file string, line int, s string) error {
now := time.Now()
buf := l.pool.Get().(*bytes.Buffer)
buf.Reset()
l.formatOutput(buf, now, file, line, lvl)
if id != "" {
buf.WriteByte('[')
buf.WriteString(id)
buf.WriteByte(']')
buf.WriteByte(' ')
}
buf.WriteString(s)
if len(s) > 0 && s[len(s)-1] != '\n' {
buf.WriteByte('\n')
}
out := l.writer.Load().(io.Writer)
_, err := out.Write(buf.Bytes())
l.pool.Put(buf)
return err
}
// -----------------------------------------
func (l *logger) outputf(lvl Level, format string, v []interface{}) {
l.Outputf("", lvl, l.calldepth, format, v...)
}
func (l *logger) output(lvl Level, v []interface{}) {
l.Output("", lvl, l.calldepth, v...)
}
func (l *logger) Printf(format string, v ...interface{}) { l.outputf(Linfo, format, v) }
func (l *logger) Println(v ...interface{}) { l.output(Linfo, v) }
func (l *logger) Debugf(format string, v ...interface{}) { l.outputf(Ldebug, format, v) }
func (l *logger) Debug(v ...interface{}) { l.output(Ldebug, v) }
func (l *logger) Infof(format string, v ...interface{}) { l.outputf(Linfo, format, v) }
func (l *logger) Info(v ...interface{}) { l.output(Linfo, v) }
func (l *logger) Warnf(format string, v ...interface{}) { l.outputf(Lwarn, format, v) }
func (l *logger) Warn(v ...interface{}) { l.output(Lwarn, v) }
func (l *logger) Warningf(format string, v ...interface{}) { l.outputf(Lwarn, format, v) }
func (l *logger) Warning(v ...interface{}) { l.output(Lwarn, v) }
func (l *logger) Errorf(format string, v ...interface{}) { l.outputf(Lerror, format, v) }
func (l *logger) Error(v ...interface{}) { l.output(Lerror, v) }
func (l *logger) Fatalf(format string, v ...interface{}) {
l.outputf(Lfatal, format, v)
os.Exit(1)
}
func (l *logger) Fatal(v ...interface{}) {
l.output(Lfatal, v)
os.Exit(1)
}
func (l *logger) Panicf(format string, v ...interface{}) {
s := fmt.Sprintf(format, v...)
l.outputf(Lpanic, format, v)
panic(s)
}
func (l *logger) Panic(v ...interface{}) {
s := fmt.Sprintln(v...)
l.output(Lpanic, v)
panic(s)
}
// -----------------------------------------
func (l *logger) GetOutputLevel() Level {
return Level(atomic.LoadInt32(&l.level))
}
func (l *logger) SetOutput(w io.Writer) {
l.writer.Store(&logWriter{w})
}
func (l *logger) SetOutputLevel(lvl Level) {
if lvl >= maxLevel {
lvl = Lfatal
}
atomic.StoreInt32(&l.level, int32(lvl))
}
func (l *logger) formatOutput(buf *bytes.Buffer, t time.Time, file string, line int, lvl Level) {
year, month, day := t.Date()
itoa(buf, year, 4)
buf.WriteByte('/')
itoa(buf, int(month), 2)
buf.WriteByte('/')
itoa(buf, day, 2)
buf.WriteByte(' ')
hour, min, sec := t.Clock()
itoa(buf, hour, 2)
buf.WriteByte(':')
itoa(buf, min, 2)
buf.WriteByte(':')
itoa(buf, sec, 2)
buf.WriteByte('.')
itoa(buf, t.Nanosecond()/1e3, 6)
buf.WriteByte(' ')
buf.WriteString(levelToStrings[lvl])
buf.WriteByte(' ')
buf.WriteString(file)
buf.WriteByte(':')
itoa(buf, line, -1)
buf.WriteByte(' ')
}
// itoa formats a cheap integer as fixed-width decimal ASCII.
// Use a negative width to avoid zero-padding.
// The buffer is assumed to have enough capacity.
func itoa(buf *bytes.Buffer, i int, width int) {
u := uint(i)
if u == 0 && width <= 1 {
buf.WriteByte('0')
return
}
// assemble decimal in reverse order
var b [32]byte
bp := len(b)
for ; u > 0 || width > 0; u /= 10 {
bp--
width--
b[bp] = byte(u%10) + '0'
}
// avoid slicing b to make an allocation
buf.Write(b[bp:])
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package retry
import (
"context"
"time"
)
// Insist retries f until it succeeds.
// onError is called for every error returned.
// It sleeps for duration after each failed run of f.
func Insist(duration time.Duration, f func() error, onError func(error)) {
err := f()
if err == nil {
return
}
onError(err)
timer := time.NewTimer(duration)
defer timer.Stop()
<-timer.C
for {
err = f()
if err == nil {
return
}
onError(err)
timer.Reset(duration)
<-timer.C
}
}
// InsistContext retries f until it succeeds or the context is done.
// onError is called for every error returned.
// It sleeps for duration after each failed run of f.
func InsistContext(ctx context.Context, duration time.Duration, f func() error, onError func(error)) {
err := f()
if err == nil {
return
}
onError(err)
timer := time.NewTimer(duration)
defer timer.Stop()
select {
case <-ctx.Done():
return
case <-timer.C:
}
for {
err = f()
if err == nil {
return
}
onError(err)
timer.Reset(duration)
select {
case <-ctx.Done():
return
case <-timer.C:
}
}
}
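// A minimal usage sketch (illustrative only; registerService is hypothetical):
//
//	retry.InsistContext(ctx, 3*time.Second, func() error {
//		return registerService(ctx)
//	}, func(err error) {
//		log.Warn("register failed, retrying:", err)
//	})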
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package retry
import (
"errors"
"time"
)
var (
// ErrRetryFailed all retry attempts failed.
ErrRetryFailed = errors.New("retry: all retry attempts failed")
// ErrRetryNext retry next on interrupt.
ErrRetryNext = errors.New("retry: retry next on interrupt")
)
// Retryer is an interface for retrying a specific function.
type Retryer interface {
// On performs a retry on function, until it doesn't return any error.
On(func() error) error
// RuptOn performs a retry on function, until it doesn't return any error or interrupt.
RuptOn(func() (bool, error)) error
}
type retry struct {
attempts int
nextDelay func() uint32
}
// On implements Retryer.On.
func (r *retry) On(caller func() error) error {
var lastErr error
attempt := 1
for attempt <= r.attempts {
if lastErr = caller(); lastErr == nil {
return nil
}
// do not wait on last useless delay
if attempt >= r.attempts {
break
}
time.Sleep(time.Duration(r.nextDelay()) * time.Millisecond)
attempt++
}
return lastErr
}
// RuptOn implements Retryer.RuptOn.
func (r *retry) RuptOn(caller func() (bool, error)) error {
var lastErr error
attempt := 1
for attempt <= r.attempts {
interrupted, err := caller()
if err == nil {
return nil
}
// return last error of method, if interrupted
if err != ErrRetryNext {
lastErr = err
}
if interrupted {
break
}
// do not wait on last useless delay
if attempt >= r.attempts {
break
}
time.Sleep(time.Duration(r.nextDelay()) * time.Millisecond)
attempt++
}
return lastErr
}
// Timed returns a retry with fixed interval delay.
func Timed(attempts int, delay uint32) Retryer {
return &retry{
attempts: attempts,
nextDelay: func() uint32 {
return delay
},
}
}
// ExponentialBackoff returns a retry whose delay grows by expDelay on each attempt (expDelay, 2*expDelay, ...).
func ExponentialBackoff(attempts int, expDelay uint32) Retryer {
next := expDelay
return &retry{
attempts: attempts,
nextDelay: func() uint32 {
r := next
next += expDelay
return r
},
}
}
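// A minimal usage sketch (illustrative only; doRequest is hypothetical):
//
//	err := retry.Timed(3, 200).On(func() error {
//		return doRequest(ctx)
//	})
//	// retry.ExponentialBackoff(5, 100) waits 100ms, 200ms, 300ms, ... between attempts.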
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package task
import "context"
var (
// C alias of Concurrent
C = Concurrent
// CC alias of ConcurrentContext
CC = ConcurrentContext
)
// Concurrent runs the tasks concurrently.
func Concurrent(f func(index int, arg interface{}), args []interface{}) {
ConcurrentContext(context.Background(), f, args)
}
// ConcurrentContext runs the tasks concurrently with a context.
// How to build a []interface{}, see: https://golang.org/doc/faq#convert_slice_of_interface
func ConcurrentContext(ctx context.Context, f func(index int, arg interface{}), args []interface{}) {
tasks := make([]func() error, len(args))
for ii := 0; ii < len(args); ii++ {
index, arg := ii, args[ii]
tasks[ii] = func() error {
f(index, arg)
return nil
}
}
Run(ctx, tasks...)
}
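// A minimal usage sketch (illustrative only):
//
//	args := []interface{}{"a", "b", "c"}
//	task.C(func(index int, arg interface{}) {
//		fmt.Println(index, arg.(string))
//	}, args)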
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package task
import (
"context"
)
type semaphore struct {
ready chan struct{}
}
func newSemaphore(n int) *semaphore {
s := &semaphore{
ready: make(chan struct{}, n),
}
for ii := 0; ii < n; ii++ {
s.ready <- struct{}{}
}
return s
}
func (s *semaphore) Wait() <-chan struct{} {
return s.ready
}
func (s *semaphore) Signal() {
s.ready <- struct{}{}
}
// Run executes a list of tasks in parallel and
// returns the first error, or nil if all tasks are done.
func Run(ctx context.Context, tasks ...func() error) error {
n := len(tasks)
semaphore := newSemaphore(n)
errorCh := make(chan error, 1)
for _, task := range tasks {
<-semaphore.Wait()
go func(task func() error) {
err := task()
if err == nil {
semaphore.Signal()
return
}
select {
case errorCh <- err:
default:
}
}(task)
}
for ii := 0; ii < n; ii++ {
select {
case err := <-errorCh:
return err
case <-ctx.Done():
return ctx.Err()
case <-semaphore.Wait():
}
}
return nil
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package version
import (
"fmt"
"io/ioutil"
"os"
)
var (
version string = ""
fPerm os.FileMode = 0o600
)
func init() {
if len(os.Args) > 1 && os.Args[1] == "-version" {
fmt.Println("version:", version)
os.Exit(0)
}
writeFile(".version", version)
}
func Version() string {
return version
}
func writeFile(fname, field string) {
if field != "" {
ioutil.WriteFile(fname, []byte(field), fPerm)
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"os"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
const (
_ int = iota
statusOK
statusNoent
statusError
)
type BcacheClient struct {
connPool *ConnPool
}
var (
once sync.Once
client *BcacheClient
)
func NewBcacheClient() *BcacheClient {
once.Do(func() {
expireTime := int64(time.Second * ConnectExpireTime)
cp := NewConnPool(UnixSocketPath, 20, 200, expireTime)
client = &BcacheClient{connPool: cp}
})
return client
}
func (c *BcacheClient) Get(key string, buf []byte, offset uint64, size uint32) (int, error) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("bcache-get", err, bgTime, 1)
}()
req := &GetCacheRequest{
CacheKey: key,
Offset: offset,
Size: size,
}
packet := NewBlockCachePacket()
packet.Opcode = OpBlockCacheGet
err = packet.MarshalData(req)
if err != nil {
log.LogDebugf("get block cache: req(%v) err(%v)", req.CacheKey, err)
return 0, err
}
stat.EndStat("bcache-get-marshal", err, bgTime, 1)
conn, err := c.connPool.Get()
if err != nil {
log.LogDebugf("get block cache: get Conn failed, req(%v) err(%v)", req.CacheKey, err)
return 0, err
}
defer func() {
c.connPool.Put(conn)
}()
stat.EndStat("bcache-get-conn", err, bgTime, 1)
err = packet.WriteToConn(*conn)
if err != nil {
log.LogDebugf("Failed to write to conn, req(%v) err(%v)", req.CacheKey, err)
return 0, errors.NewErrorf("Failed to write to conn, req(%v) err(%v)", req.CacheKey, err)
}
stat.EndStat("bcache-get-writeconn", err, bgTime, 1)
err = packet.ReadFromConn(*conn, 1)
if err != nil {
log.LogDebugf("Failed to read from conn, req(%v), err(%v)", req.CacheKey, err)
return 0, errors.NewErrorf("Failed to read from conn, req(%v), err(%v)", req.CacheKey, err)
}
stat.EndStat("bcache-get-readconn", err, bgTime, 1)
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogDebugf("get block cache: req(%v) err(%v) result(%v)", req.CacheKey, err, packet.GetResultMsg())
return 0, err
}
resp := new(GetCachePathResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogDebugf("get block cache: req(%v) err(%v) PacketData(%v)", req.CacheKey, err, string(packet.Data))
return 0, err
}
cachePath := resp.CachePath
stat.EndStat("bcache-get-meta", err, bgTime, 1)
readBgTime := stat.BeginStat()
subs := strings.Split(cachePath, "/")
if subs[len(subs)-1] != key {
log.LogDebugf("cacheKey(%v) cache path(%v) is not legal",
key, cachePath)
return 0, errors.NewErrorf("cacheKey(%v) cache path is not legal: %v", key, cachePath)
}
f, err := os.Open(cachePath)
if err != nil {
return 0, err
}
defer f.Close()
n, err := f.ReadAt(buf, int64(offset))
if n != int(size) {
log.LogDebugf("get block cache: BCache client GET() error, expected size(%v), but read size(%v)", size, n)
return 0, errors.NewErrorf("BcacheClient GET() error, expected size(%v), but read size(%v)", size, n)
}
if err != nil {
log.LogDebugf("get block cache: BCache client read %v err %v", cachePath, err.Error())
return 0, errors.NewErrorf("get block cache: BCache client read %v err %v", cachePath, err.Error())
}
encryptXOR(buf[:n])
stat.EndStat("bcache-get-read", err, readBgTime, 1)
return n, nil
}
func (c *BcacheClient) Put(key string, buf []byte) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("bcache-put", err, bgTime, 1)
}()
req := &PutCacheRequest{
CacheKey: key,
Data: buf,
}
packet := NewBlockCachePacket()
packet.Opcode = OpBlockCachePut
err = packet.MarshalData(req)
if err != nil {
log.LogDebugf("put block cache: req(%v) err(%v)", req.CacheKey, err)
return err
}
conn, err := c.connPool.Get()
if err != nil {
log.LogDebugf("put block cache: get Conn failed, req(%v) err(%v)", req.CacheKey, err)
return err
}
defer func() {
c.connPool.Put(conn)
}()
err = packet.WriteToConn(*conn)
if err != nil {
log.LogDebugf("Failed to write to conn, req(%v) err(%v)", req.CacheKey, err)
return errors.NewErrorf("Failed to write to conn, req(%v) err(%v)", req.CacheKey, err)
}
err = packet.ReadFromConn(*conn, proto.NoReadDeadlineTime)
if err != nil {
log.LogDebugf("Failed to read from conn, req(%v), err(%v)", req.CacheKey, err)
return errors.NewErrorf("Failed to read from conn, req(%v), err(%v)", req.CacheKey, err)
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogDebugf("put block cache: req(%v) err(%v) result(%v)", req.CacheKey, err, packet.GetResultMsg())
return err
}
return err
}
func (c *BcacheClient) Evict(key string) error {
req := &DelCacheRequest{CacheKey: key}
packet := NewBlockCachePacket()
packet.Opcode = OpBlockCacheDel
err := packet.MarshalData(req)
if err != nil {
log.LogDebugf("del block cache: req(%v) err(%v)", req.CacheKey, err)
return err
}
conn, err := c.connPool.Get()
if err != nil {
log.LogDebugf("del block cache: get Conn failed, req(%v) err(%v)", req.CacheKey, err)
return err
}
defer func() {
c.connPool.Put(conn)
}()
err = packet.WriteToConn(*conn)
if err != nil {
return err
}
err = packet.ReadFromConn(*conn, proto.NoReadDeadlineTime)
if err != nil {
return err
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("del block cache: req(%v) err(%v) result(%v)", req.CacheKey, err, packet.GetResultMsg())
return err
}
log.LogDebugf("del block cache success: req(%v)", req.CacheKey)
return nil
}
func parseStatus(result uint8) (status int) {
switch result {
case proto.OpOk:
status = statusOK
case proto.OpNotExistErr:
status = statusNoent
default:
status = statusError
}
return
}
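// A minimal usage sketch of the client (illustrative only; the key and size values are hypothetical):
//
//	bc := NewBcacheClient()
//	buf := make([]byte, 4096)
//	if n, err := bc.Get("block-key", buf, 0, 4096); err == nil {
//		_ = buf[:n] // cache hit
//	} else {
//		// cache miss: read from the data source, then populate the cache
//		_ = bc.Put("block-key", buf)
//	}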
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"net"
"time"
)
const (
DefaultTimeOut = 1 * time.Second
ConnectExpireTime = 20
)
type ConnObject struct {
c *net.Conn
lastActive int64
}
type ConnPool struct {
conns chan *ConnObject
mincap int
maxcap int
expire int64
target string
}
func NewConnPool(target string, mincap, maxcap int, expire int64) *ConnPool {
p := &ConnPool{
conns: make(chan *ConnObject, maxcap),
mincap: mincap,
maxcap: maxcap,
expire: expire,
target: target,
}
return p
}
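// Typical use of the pool (illustrative sketch): borrow a connection for one
// request/response round trip and return it so it can be reused until it
// expires. The expire argument is compared against UnixNano in Get, so it is
// given in nanoseconds.
//
// pool := NewConnPool("/var/run/cubefscache/bcache.socket", 2, 64, int64(20*time.Second))
// conn, err := pool.Get()
// if err == nil {
//     defer pool.Put(conn)
//     // write and read packets on *conn here
// }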
func (connPool *ConnPool) Get() (c *net.Conn, err error) {
var o *ConnObject
for {
select {
case o = <-connPool.conns:
default:
return connPool.NewConnect(connPool.target)
}
if time.Now().UnixNano()-o.lastActive > connPool.expire {
_ = (*o.c).Close()
o = nil
continue
}
return o.c, nil
}
}
func (connPool *ConnPool) NewConnect(target string) (*net.Conn, error) {
conn, err := net.DialTimeout("unix", target, DefaultTimeOut)
return &conn, err
}
func (connPool *ConnPool) Put(c *net.Conn) {
o := &ConnObject{
c: c,
lastActive: time.Now().UnixNano(),
}
select {
case connPool.conns <- o:
return
default:
if o.c != nil {
(*o.c).Close()
}
return
}
}
// Copyright (C) 2020 Juicefs
// Modified work Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"bufio"
"bytes"
"container/list"
"crypto/md5"
"encoding/hex"
"fmt"
"hash/crc32"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
const (
PathListSeparator = ";"
CacheConfSeparator = ":"
SpaceCheckInterval = 60 * time.Second
TmpFileCheckInterval = 20 * 60 * time.Second
FilePerm = 0o644
Basedir = "blocks"
)
type ReadCloser interface {
io.Reader
io.ReaderAt
io.Closer
}
type BcacheManager interface {
cache(key string, data []byte, direct bool)
read(key string, offset uint64, len uint32) (io.ReadCloser, error)
queryCachePath(key string, offset uint64, len uint32) (string, error)
load(key string) (ReadCloser, error)
erase(key string)
stats() (int64, int64)
}
func newBcacheManager(conf *bcacheConfig) BcacheManager {
log.LogInfof("init block cache: %s size:%d GB", conf.CacheDir, conf.BlockSize)
if conf.CacheDir == "" {
log.LogWarnf("no cache config,cacheDirs or size is empty!")
return nil
}
// todo cachedir reg match
cacheDirs := strings.Split(conf.CacheDir, PathListSeparator)
if len(cacheDirs) == 0 {
log.LogWarnf("no cache dir config!")
return nil
}
dirSizeMap := make(map[string]int64, len(cacheDirs))
for _, dir := range cacheDirs {
result := strings.Split(dir, CacheConfSeparator)
if len(result) != 2 {
log.LogWarnf("cache dir config error: expect dirPath:cacheSize, got(%v)", dir)
return nil
}
dirPath := result[0]
cacheSize, err := strconv.Atoi(result[1])
if dirPath == "" || err != nil {
log.LogWarnf("cache dir config error!")
return nil
}
dirSizeMap[dirPath] = int64(cacheSize)
conf.CacheSize = conf.CacheSize + int64(cacheSize)
}
bm := &bcacheManager{
bstore: make([]*DiskStore, len(cacheDirs)),
bcacheKeys: make(map[string]*list.Element),
lrulist: list.New(),
blockSize: conf.BlockSize,
pending: make(chan waitFlush, 1024),
}
index := 0
for cacheDir, cacheSize := range dirSizeMap {
disk := NewDiskStore(cacheDir, cacheSize, conf)
bm.bstore[index] = disk
go bm.reBuildCacheKeys(cacheDir, disk)
index++
}
go bm.spaceManager()
go bm.flush()
// go bm.scrub()
return bm
}
type cacheItem struct {
key string
size uint32
}
type keyPair struct {
key string
it *cacheItem
}
// key vid_inode_offset
type waitFlush struct {
Key string
Data []byte
}
type bcacheManager struct {
sync.RWMutex
bcacheKeys map[string]*list.Element
lrulist *list.List
bstore []*DiskStore
blockSize uint32
pending chan waitFlush
}
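// encryptXOR obfuscates block data in place by XOR-ing every byte with 0xF.
// XOR with a constant mask is its own inverse, so the same function is used
// both before writing a block to disk and after reading it back.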
func encryptXOR(data []byte) {
for index, value := range data {
data[index] = value ^ byte(0xF)
}
}
func (bm *bcacheManager) queryCachePath(key string, offset uint64, len uint32) (path string, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("GetCache:GetCachePath", err, bgTime, 1)
}()
bm.Lock()
element, ok := bm.bcacheKeys[key]
bm.Unlock()
if ok {
item := element.Value.(*cacheItem)
path, err := bm.getCachePath(key)
if err != nil {
return "", err
}
bm.Lock()
bm.lrulist.MoveToBack(element)
bm.Unlock()
log.LogDebugf("Cache item found. key=%v offset =%v,len=%v size=%v, path=%v", key, offset, len, item.size, path)
return path, nil
}
log.LogDebugf("Cache item not found. key=%v offset =%v,len=%v", key, offset, len)
return "", os.ErrNotExist
}
func (bm *bcacheManager) getCachePath(key string) (string, error) {
if len(bm.bstore) == 0 {
return "", errors.New("no cache dir")
}
cachePath := bm.selectDiskKv(key).getPath(key)
return cachePath, nil
}
func (bm *bcacheManager) cache(key string, data []byte, direct bool) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Cache:Write", nil, bgTime, 1)
stat.StatBandWidth("Cache", uint32(len(data)))
}()
log.LogDebugf("TRACE cache. key(%v) len(%v) direct(%v)", key, len(data), direct)
if direct {
bm.cacheDirect(key, data)
return
}
select {
case bm.pending <- waitFlush{Key: key, Data: data}:
default:
log.LogDebugf("pending chan is full,skip memory. key =%v,len=%v bytes", key, len(data))
bm.cacheDirect(key, data)
}
}
func (bm *bcacheManager) cacheDirect(key string, data []byte) {
diskKv := bm.selectDiskKv(key)
if diskKv.flushKey(key, data) == nil {
bm.Lock()
item := &cacheItem{
key: key,
size: uint32(len(data)),
}
element := bm.lrulist.PushBack(item)
bm.bcacheKeys[key] = element
bm.Unlock()
}
}
func (bm *bcacheManager) read(key string, offset uint64, len uint32) (io.ReadCloser, error) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("GetCache:Read", err, bgTime, 1)
if err == nil {
stat.StatBandWidth("GetCache:Read", len)
}
}()
metaBgTime := stat.BeginStat()
bm.Lock()
element, ok := bm.bcacheKeys[key]
bm.Unlock()
stat.EndStat("GetCache:Read:GetMeta", nil, metaBgTime, 1)
log.LogDebugf("Trace read. ok =%v", ok)
if ok {
item := element.Value.(*cacheItem)
f, err := bm.load(key)
if os.IsNotExist(err) {
bm.Lock()
delete(bm.bcacheKeys, key)
bm.Unlock()
d := bm.selectDiskKv(key)
atomic.AddInt64(&d.usedSize, -int64(item.size))
atomic.AddInt64(&d.usedCount, -1)
return nil, os.ErrNotExist
}
if err != nil {
return nil, err
}
defer f.Close()
size := item.size
log.LogDebugf("read. offset =%v,len=%v size=%v", offset, len, size)
if uint32(offset)+len > size {
len = size - uint32(offset)
}
dataBgTime := stat.BeginStat()
buf := make([]byte, len)
n, err := f.ReadAt(buf, int64(offset))
stat.EndStat("GetCache:Read:ReadData", err, dataBgTime, 1)
if err != nil {
return nil, err
} else {
// decrypt
encryptXOR(buf[:n])
return io.NopCloser(bytes.NewBuffer(buf[:n])), nil
}
} else {
err = os.ErrNotExist
}
return nil, err
}
func (bm *bcacheManager) load(key string) (ReadCloser, error) {
if len(bm.bstore) == 0 {
return nil, errors.New("no cache dir")
}
f, err := bm.selectDiskKv(key).load(key)
if err != nil {
return nil, err
}
bm.Lock()
defer bm.Unlock()
if element, ok := bm.bcacheKeys[key]; ok {
bm.lrulist.MoveToBack(element)
}
return f, err
}
func (bm *bcacheManager) erase(key string) {
if len(bm.bstore) == 0 {
return
}
err := bm.selectDiskKv(key).remove(key)
if err == nil {
bm.Lock()
defer bm.Unlock()
if element, ok := bm.bcacheKeys[key]; ok {
bm.lrulist.Remove(element)
}
delete(bm.bcacheKeys, key)
}
}
func (bm *bcacheManager) stats() (int64, int64) {
var usedCount, usedSize int64
for _, item := range bm.bstore {
usedSize += atomic.LoadInt64(&item.usedSize)
usedCount += atomic.LoadInt64(&item.usedCount)
}
return usedCount, usedSize
}
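// selectDiskKv deterministically maps a cache key to one of the configured
// disk stores using a CRC32 hash of the key, so a given key always resolves
// to the same cache directory. For example (illustrative), with two stores a
// key whose hash is 7 lands on bstore[7%2], i.e. bstore[1].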
func (bm *bcacheManager) selectDiskKv(key string) *DiskStore {
return bm.bstore[hashKey(key)%uint32(len(bm.bstore))]
}
func (bm *bcacheManager) spaceManager() {
ticker := time.NewTicker(SpaceCheckInterval)
tmpTicker := time.NewTicker(TmpFileCheckInterval)
defer func() {
ticker.Stop()
tmpTicker.Stop()
}()
for {
select {
case <-ticker.C:
for _, store := range bm.bstore {
useRatio, files := store.diskUsageRatio()
log.LogDebugf("useRation(%v), files(%v)", useRatio, files)
if 1-useRatio < store.freeLimit || files > int64(store.limit) {
bm.freeSpace(store, 1-useRatio, files)
}
}
case <-tmpTicker.C:
for _, store := range bm.bstore {
useRatio, files := store.diskUsageRatio()
log.LogInfof("useRation(%v), files(%v)", useRatio, files)
bm.deleteTmpFile(store)
}
}
}
}
//lru cache
//func (bm *bcacheManager) freeSpace(index int, store *DiskStore, free float32, files int64) {
// var decreaseSpace int64
// var decreaseCnt int
// storeCnt := uint32(len(bm.bstore))
// bm.Lock()
// defer bm.Unlock()
// if free < store.freeLimit {
// decreaseSpace = int64((store.freeLimit - free) * (float32(store.capacity)))
// }
// if files > int64(store.limit) {
// decreaseCnt = int(files - int64(store.limit))
// }
// var lastKey string
// var lastItem cacheItem
// var cnt int
// for key, value := range bm.bcacheKeys {
// if int(hashKey(key)%storeCnt) == index {
// if cnt == 0 || lastItem.atime > value.atime {
// lastKey = key
// lastItem = value
// }
// cnt++
// if cnt > 1 {
// store.remove(lastKey)
// delete(bm.bcacheKeys, lastKey)
// decreaseSpace -= int64(value.size)
// decreaseCnt--
// cnt = 0
// log.LogDebugf("remove %s from cache, age: %d", lastKey, lastItem.atime)
// if decreaseCnt <= 0 && decreaseSpace <= 0 {
// break
// }
// }
// }
// }
//
//}
// lru
func (bm *bcacheManager) freeSpace(store *DiskStore, free float32, files int64) {
var decreaseSpace int64
var decreaseCnt int
if free < store.freeLimit {
decreaseSpace = int64((store.freeLimit - free) * (float32(store.capacity)))
}
if files > int64(store.limit) {
decreaseCnt = int(files - int64(store.limit))
}
cnt := 0
for {
if decreaseCnt <= 0 && decreaseSpace <= 0 {
break
}
// avoid dead loop
if cnt > 500000 {
break
}
bm.Lock()
element := bm.lrulist.Front()
if element == nil {
bm.Unlock()
return
}
item := element.Value.(*cacheItem)
if err := store.remove(item.key); err == nil {
bm.lrulist.Remove(element)
delete(bm.bcacheKeys, item.key)
decreaseSpace -= int64(item.size)
decreaseCnt--
}
// advance the loop counter even when a removal fails, so the dead-loop
// guard above still terminates the loop eventually
cnt++
bm.Unlock()
log.LogDebugf("remove %v from cache", item.key)
}
}
func (bm *bcacheManager) reBuildCacheKeys(dir string, store *DiskStore) {
if _, err := os.Stat(dir); err != nil {
log.LogErrorf("cache dir %s is not exists", dir)
return
}
log.LogDebugf("reBuildCacheKeys(%s)", dir)
c := make(chan keyPair)
keyPrefix := filepath.Join(dir, Basedir)
go func() {
filepath.Walk(dir, bm.walker(c, keyPrefix, true))
close(c)
}()
for value := range c {
bm.Lock()
element := bm.lrulist.PushBack(value.it)
bm.bcacheKeys[value.key] = element
bm.Unlock()
log.LogDebugf("updateStat(%v)", value.it.size)
store.updateStat(value.it.size)
}
}
func (bm *bcacheManager) walker(c chan keyPair, prefix string, initial bool) filepath.WalkFunc {
return func(path string, info os.FileInfo, err error) error {
if err != nil {
log.LogWarnf("walk path %v failed %v", path, err)
return err
}
if info.IsDir() || !strings.HasPrefix(path, prefix) {
return nil
}
if strings.HasSuffix(path, ".tmp") && (initial || checkoutTempFileOuttime(path)) {
os.Remove(path)
log.LogDebugf("Remove tmp file %v", path)
return nil
}
_, key := filepath.Split(path)
size := uint32(info.Size())
pair := keyPair{
key: key,
it: &cacheItem{
key: key,
size: size,
},
}
c <- pair
return nil
}
}
func (bm *bcacheManager) flush() {
for {
pending := <-bm.pending
diskKv := bm.selectDiskKv(pending.Key)
log.LogDebugf("flush data,key(%v), dir(%v)", pending.Key, diskKv.dir)
if diskKv.flushKey(pending.Key, pending.Data) == nil {
bm.Lock()
item := &cacheItem{
key: pending.Key,
size: uint32(len(pending.Data)),
}
element := bm.lrulist.PushBack(item)
bm.bcacheKeys[pending.Key] = element
bm.Unlock()
}
}
}
func hashKey(key string) uint32 {
return crc32.ChecksumIEEE([]byte(key))
}
type DiskStore struct {
sync.Mutex
dir string
mode uint32
capacity int64
freeLimit float32
limit uint32
usedSize int64
usedCount int64
}
func NewDiskStore(dir string, cacheSize int64, config *bcacheConfig) *DiskStore {
if config.Mode == 0 {
config.Mode = FilePerm
}
if config.FreeRatio <= 0 {
config.FreeRatio = 0.15
}
if config.Limit <= 0 {
config.Limit = 50000000
}
if config.Limit > 50000000 {
config.Limit = 50000000
}
c := &DiskStore{
dir: dir,
mode: config.Mode,
capacity: cacheSize,
freeLimit: config.FreeRatio,
limit: config.Limit,
}
log.LogDebugf("ignored method DiskStore.scrub at %p", c.scrub) // TODO: ignored
c.checkBuildCacheDir(dir)
return c
}
func (d *DiskStore) checkBuildCacheDir(dir string) {
mode := os.FileMode(d.mode)
if st, err := os.Stat(dir); os.IsNotExist(err) {
if parent := filepath.Dir(dir); parent != dir {
d.checkBuildCacheDir(parent)
}
os.Mkdir(dir, mode)
} else if err != nil && st.Mode() != mode {
os.Chmod(dir, mode)
}
}
//func (d *DiskStore) flushKey(key string, data []byte) error {
// var err error
// bgTime := stat.BeginStat()
// defer func() {
// stat.EndStat("Cache:Write:FlushData", err, bgTime, 1)
// }()
// cachePath := d.buildCachePath(key, d.dir)
// log.LogDebugf("TRACE BCacheService flushKey Enter. key(%v) cachePath(%v)", key, cachePath)
// d.checkBuildCacheDir(filepath.Dir(cachePath))
// tmp := cachePath + ".tmp"
// f, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(d.mode))
// defer os.Remove(tmp)
// if err != nil {
// log.LogErrorf("Create block tmp file:%s err:%s!", tmp, err)
// return err
// }
// //encrypt
// encryptXOR(data)
// _, err = f.Write(data)
// if err != nil {
// f.Close()
// log.LogErrorf("Write tmp failed: file %s err %s!", tmp, err)
// return err
// }
// err = f.Close()
// if err != nil {
// log.LogErrorf("Close tmp failed: file:%s err:%s!", tmp, err)
// return err
// }
// info, err := os.Stat(cachePath)
// //if already cached
// if !os.IsNotExist(err) {
// atomic.AddInt64(&d.usedSize, -(info.Size()))
// atomic.AddInt64(&d.usedCount, -1)
// os.Remove(cachePath)
// }
// err = os.Rename(tmp, cachePath)
// if err != nil {
// log.LogErrorf("Rename block tmp file:%s err:%s!", tmp, err)
// return err
// }
// atomic.AddInt64(&d.usedSize, int64(len(data)))
// atomic.AddInt64(&d.usedCount, 1)
// log.LogDebugf("TRACE BCacheService flushKey Exit. key(%v) cachePath(%v)", key, cachePath)
// return nil
//
//}
func (d *DiskStore) flushKey(key string, data []byte) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Cache:Write:FlushData", err, bgTime, 1)
}()
cachePath := d.buildCachePath(key, d.dir)
info, err := os.Stat(cachePath)
// if already cached
if err == nil && info.Size() > 0 {
return nil
}
log.LogDebugf("TRACE BCacheService flushKey Enter. key(%v) cachePath(%v)", key, cachePath)
d.checkBuildCacheDir(filepath.Dir(cachePath))
tmp := cachePath + ".tmp"
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(d.mode))
defer os.Remove(tmp)
if err != nil {
log.LogWarnf("Create block tmp file:%s err:%s!", tmp, err)
return err
}
// encrypt
encryptXOR(data)
_, err = f.Write(data)
if err != nil {
f.Close()
log.LogErrorf("Write tmp failed: file %s err %s!", tmp, err)
return err
}
err = f.Close()
if err != nil {
log.LogErrorf("Close tmp failed: file:%s err:%s!", tmp, err)
return err
}
//info, err := os.Stat(cachePath)
////if already cached
//if !os.IsNotExist(err) {
// atomic.AddInt64(&d.usedSize, -(info.Size()))
// atomic.AddInt64(&d.usedCount, -1)
// os.Remove(cachePath)
//}
err = os.Rename(tmp, cachePath)
if err != nil {
log.LogErrorf("Rename block tmp file:%s err:%s!", tmp, err)
return err
}
atomic.AddInt64(&d.usedSize, int64(len(data)))
atomic.AddInt64(&d.usedCount, 1)
log.LogDebugf("TRACE BCacheService flushKey Exit. key(%v) cachePath(%v)", key, cachePath)
return nil
}
func (d *DiskStore) load(key string) (ReadCloser, error) {
cachePath := d.buildCachePath(key, d.dir)
log.LogDebugf("TRACE BCacheService load Enter. key(%v) cachePath(%v)", key, cachePath)
//if _, err := os.Stat(cachePath); err != nil {
// return nil, errors.NewError(os.ErrNotExist)
//}
f, err := os.OpenFile(cachePath, os.O_RDONLY, os.FileMode(d.mode))
log.LogDebugf("TRACE BCacheService load Exit. err(%v)", err)
return f, err
}
func (d *DiskStore) remove(key string) (err error) {
var size int64
cachePath := d.buildCachePath(key, d.dir)
log.LogDebugf("remove. cachePath(%v)", cachePath)
if info, err := os.Stat(cachePath); err == nil {
size = info.Size()
if err = os.Remove(cachePath); err == nil {
atomic.AddInt64(&d.usedSize, -size)
atomic.AddInt64(&d.usedCount, -1)
}
}
return err
}
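// buildCachePath derives the on-disk location of a cached block. Keys are
// expected to look like "<volume>_<inode>_<offset>"; when the inode part can
// be parsed it spreads files across 512 second-level directories, otherwise
// the key hash is used for both levels. Illustrative result for key
// "1_8388610_0" under dir "/data1/bcache":
//
// /data1/bcache/blocks/<hash(key)&0xFFF%512>/<8388610%512>/1_8388610_0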
func (d *DiskStore) buildCachePath(key string, dir string) string {
parts := strings.Split(key, "_")
if len(parts) < 2 {
// malformed key: fall back to hashing for both directory levels
return fmt.Sprintf("%s/blocks/%d/%d/%s", dir, hashKey(key)&0xFFF%512, hashKey(key)%512, key)
}
inodeId, err := strconv.ParseInt(parts[1], 10, 64)
if err != nil {
return fmt.Sprintf("%s/blocks/%d/%d/%s", dir, hashKey(key)&0xFFF%512, hashKey(key)%512, key)
}
return fmt.Sprintf("%s/blocks/%d/%d/%s", dir, hashKey(key)&0xFFF%512, inodeId%512, key)
}
func (d *DiskStore) diskUsageRatio() (float32, int64) {
log.LogDebugf("usedSize(%v), usedCount(%v)", atomic.LoadInt64(&d.usedSize), atomic.LoadInt64(&d.usedCount))
if atomic.LoadInt64(&d.usedSize) < 0 || atomic.LoadInt64(&d.usedCount) < 0 {
return 0, 0
}
return float32(atomic.LoadInt64(&d.usedSize)) / float32(d.capacity), atomic.LoadInt64(&d.usedCount)
}
func (d *DiskStore) scrub(key string, md5Sum string) error {
defer func() {
if r := recover(); r != nil {
return
}
}()
cachePath := d.buildCachePath(key, d.dir)
f, err := os.Open(cachePath)
if err != nil {
return err
}
defer f.Close()
r := bufio.NewReader(f)
h := md5.New()
_, err = io.Copy(h, r)
if err != nil {
return err
}
if md5Sum != hex.EncodeToString(h.Sum(nil)) {
return errors.New("scrub error")
}
return nil
}
func (d *DiskStore) updateStat(size uint32) {
atomic.AddInt64(&d.usedSize, int64(size))
atomic.AddInt64(&d.usedCount, 1)
}
func (d *DiskStore) getPath(key string) string {
cachePath := d.buildCachePath(key, d.dir)
return cachePath
}
func (bm *bcacheManager) deleteTmpFile(store *DiskStore) {
if _, err := os.Stat(store.dir); err != nil {
log.LogErrorf("cache dir %s is not exists", store.dir)
return
}
log.LogDebugf("clear tmp files in %v", store.dir)
c := make(chan keyPair)
keyPrefix := filepath.Join(store.dir, Basedir)
log.LogDebugf("keyPrefix %v", keyPrefix)
go func() {
filepath.Walk(store.dir, bm.walker(c, keyPrefix, false))
close(c)
}()
// consume chan
for range c {
}
log.LogDebugf("clear tmp files end%v", store.dir)
}
func checkoutTempFileOuttime(file string) bool {
finfo, err := os.Stat(file)
if err != nil {
return false
}
stat_t := finfo.Sys().(*syscall.Stat_t)
now := time.Now()
return now.Sub(timespecToTime(stat_t.Ctim)).Seconds() > 60*60 // 1 hour
}
func timespecToTime(ts syscall.Timespec) time.Time {
return time.Unix(int64(ts.Sec), int64(ts.Nsec))
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"strconv"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/buf"
)
const (
OpBlockCachePut uint8 = 0xB1
OpBlockCacheGet uint8 = 0xB2
OpBlockCacheDel uint8 = 0xB3
)
const (
CacheMagic uint8 = 0xFF
)
const (
PacketHeaderSize = 11
)
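// Wire layout of the PacketHeaderSize-byte header marshalled below:
//
// byte 0       Magic      (1 byte)
// byte 1       Opcode     (1 byte)
// byte 2       ResultCode (1 byte)
// bytes 3-6    CRC        (uint32, big endian)
// bytes 7-10   Size       (uint32, big endian, length of Data)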
var Buffers *buf.BufferPool
type PutCacheRequest struct {
CacheKey string `json:"key"`
Data []byte `json:"data"`
}
type GetCacheRequest struct {
CacheKey string `json:"key"`
Offset uint64 `json:"offset"`
Size uint32 `json:"size"`
}
type GetCachePathResponse struct {
CachePath string `json:"path"`
}
type GetCacheDataResponse struct {
Data []byte `json:"data"`
}
type DelCacheRequest struct {
CacheKey string `json:"key"`
}
type BlockCachePacket struct {
Magic uint8
Opcode uint8
ResultCode uint8 // Magic, Opcode and ResultCode take the first 3 header bytes
CRC uint32
Size uint32 // size of Data; header is 3 + 4 (CRC) + 4 (Size) = 11 bytes
Data []byte
StartT int64
}
func NewBlockCachePacket() *BlockCachePacket {
p := new(BlockCachePacket)
p.Magic = CacheMagic
p.StartT = time.Now().UnixNano()
return p
}
func (p *BlockCachePacket) String() string {
return fmt.Sprintf("OpMsg(%v)", p.GetOpMsg())
}
func (p *BlockCachePacket) GetOpMsg() (m string) {
switch p.Opcode {
case OpBlockCachePut:
m = "OpBlockCachePut"
case OpBlockCacheGet:
m = "OpBlockCacheGet"
case OpBlockCacheDel:
m = "OpBlockCacheDel"
default:
// do nothing
}
return
}
func (p *BlockCachePacket) GetResultMsg() (m string) {
if p == nil {
return ""
}
switch p.ResultCode {
case proto.OpErr:
m = "Err: " + string(p.Data)
case proto.OpOk:
m = "Ok"
case proto.OpNotExistErr:
m = "NotExistErr"
default:
return fmt.Sprintf("Unknown ResultCode(%v)", p.ResultCode)
}
return
}
func (p *BlockCachePacket) MarshalHeader(out []byte) {
out[0] = p.Magic
out[1] = p.Opcode
out[2] = p.ResultCode
binary.BigEndian.PutUint32(out[3:7], p.CRC)
binary.BigEndian.PutUint32(out[7:11], p.Size)
}
func (p *BlockCachePacket) UnMarshalHeader(in []byte) error {
p.Magic = in[0]
if p.Magic != CacheMagic {
return errors.New("Bad Magic " + strconv.Itoa(int(p.Magic)))
}
p.Opcode = in[1]
p.ResultCode = in[2]
p.CRC = binary.BigEndian.Uint32(in[3:7])
p.Size = binary.BigEndian.Uint32(in[7:11])
return nil
}
func (p *BlockCachePacket) MarshalData(v interface{}) error {
data, err := json.Marshal(v)
if err == nil {
p.Data = data
p.Size = uint32(len(p.Data))
}
return err
}
func (p *BlockCachePacket) UnmarshalData(v interface{}) error {
return json.Unmarshal(p.Data, v)
}
func (p *BlockCachePacket) WriteToConn(c net.Conn) (err error) {
header, err := Buffers.Get(PacketHeaderSize)
if err != nil {
header = make([]byte, PacketHeaderSize)
}
defer Buffers.Put(header)
c.SetWriteDeadline(time.Now().Add(proto.WriteDeadlineTime * time.Second))
p.MarshalHeader(header)
if _, err = c.Write(header); err == nil {
if p.Data != nil {
_, err = c.Write(p.Data[:p.Size])
}
}
return
}
func (p *BlockCachePacket) ReadFromConn(c net.Conn, timeoutSec int) (err error) {
if timeoutSec != proto.NoReadDeadlineTime {
c.SetReadDeadline(time.Now().Add(time.Second * time.Duration(timeoutSec)))
} else {
c.SetReadDeadline(time.Time{})
}
header, err := Buffers.Get(PacketHeaderSize)
if err != nil {
header = make([]byte, PacketHeaderSize)
}
defer Buffers.Put(header)
var n int
if n, err = io.ReadFull(c, header); err != nil {
return
}
if n != PacketHeaderSize {
return syscall.EBADMSG
}
if err = p.UnMarshalHeader(header); err != nil {
return
}
size := p.Size
//if p.Opcode == OpBlockCachePut || p.Opcode == OpBlockCacheDel {
// size = 0
//}
p.Data = make([]byte, size)
if n, err = io.ReadFull(c, p.Data[:size]); err != nil {
return err
}
if n != int(size) {
return syscall.EBADMSG
}
return nil
}
func (p *BlockCachePacket) PacketOkReplay() {
p.ResultCode = proto.OpOk
p.Size = 0
p.Data = nil
}
func (p *BlockCachePacket) PacketOkWithBody(reply []byte) {
p.Size = uint32(len(reply))
p.Data = make([]byte, p.Size)
copy(p.Data[:p.Size], reply)
p.ResultCode = proto.OpOk
}
func (p *BlockCachePacket) PacketErrorWithBody(code uint8, reply []byte) {
p.Size = uint32(len(reply))
p.Data = make([]byte, p.Size)
copy(p.Data[:p.Size], reply)
p.ResultCode = code
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"encoding/json"
"fmt"
"io"
"net"
"os"
"path/filepath"
"runtime"
"strconv"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
UnixSocketPath = "/var/run/cubefscache/bcache.socket"
// config
CacheDir = "cacheDir"
CacheLimit = "cacheLimit"
CacheFree = "cacheFree"
BlockSize = "blockSize"
MaxFileSize = 128 << 30
MaxBlockSize = 128 << 20
BigExtentSize = 32 << 20
)
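// Illustrative bcache section of a client config consumed by parserConf below;
// the keys come from the constants above, the values are examples only:
//
// "cacheDir":   "/data1/bcache:107374182400;/data2/bcache:107374182400",
// "cacheLimit": "50000000",
// "cacheFree":  "0.15",
// "blockSize":  "1048576"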
type bcacheConfig struct {
CacheDir string
BlockSize uint32
Mode uint32
CacheSize int64
FreeRatio float32
Limit uint32
}
type bcacheStore struct {
bcache BcacheManager
conf *bcacheConfig
control common.Control
stopC chan struct{}
}
func NewServer() *bcacheStore {
return &bcacheStore{}
}
func (s *bcacheStore) Start(cfg *config.Config) (err error) {
runtime.GOMAXPROCS(runtime.NumCPU())
return s.control.Start(s, cfg, doStart)
}
func (s *bcacheStore) Shutdown() {
s.control.Shutdown(s, doShutdown)
}
func (s *bcacheStore) Sync() {
s.control.Sync()
}
func doStart(server common.Server, cfg *config.Config) (err error) {
s, ok := server.(*bcacheStore)
if !ok {
return errors.New("Invalid node Type!")
}
// parse the config file
var bconf *bcacheConfig
bconf, err = s.parserConf(cfg)
if err != nil {
err = errors.NewErrorf("block config parser error.")
panic(err)
}
// start bcache manage
bm := newBcacheManager(bconf)
if bm == nil {
err = errors.NewErrorf("block cache manager init fail.")
panic(err)
}
s.bcache = bm
s.conf = bconf
// start unix domain socket
s.startServer()
return
}
func doShutdown(server common.Server) {
s, ok := server.(*bcacheStore)
if !ok {
return
}
// stop unix domain socket
s.stopServer()
// close connpool
}
func (s *bcacheStore) startServer() (err error) {
// create socket dir
os.MkdirAll(filepath.Dir(UnixSocketPath), FilePerm)
if _, err := os.Stat(UnixSocketPath); err == nil {
existErr := fmt.Sprintf("Another process is running or %s already exist,force delete it.", UnixSocketPath)
log.LogErrorf(existErr)
os.Remove(UnixSocketPath)
}
s.stopC = make(chan struct{})
ln, err := net.Listen("unix", UnixSocketPath)
if err != nil {
panic(err)
}
go func(stopC chan struct{}) {
defer ln.Close()
for {
conn, err := ln.Accept()
select {
case <-stopC:
return
default:
}
if err != nil {
continue
}
go s.serveConn(conn, stopC)
}
}(s.stopC)
log.LogInfof("start blockcache server.")
return
}
func (s *bcacheStore) stopServer() {
if s.stopC != nil {
defer func() {
if r := recover(); r != nil {
log.LogErrorf("action[StopBcacheServer],err:%v", r)
}
}()
close(s.stopC)
}
}
func (s *bcacheStore) serveConn(conn net.Conn, stopC chan struct{}) {
defer conn.Close()
for {
select {
case <-stopC:
return
default:
}
p := &BlockCachePacket{}
if err := p.ReadFromConn(conn, proto.NoReadDeadlineTime); err != nil {
if err != io.EOF {
log.LogDebugf("serve BcacheServer: %v", err.Error())
}
return
}
if err := s.handlePacket(conn, p); err != nil {
log.LogDebugf("serve handlePacket fail: %v", err)
}
}
}
func (s *bcacheStore) handlePacket(conn net.Conn, p *BlockCachePacket) (err error) {
switch p.Opcode {
case OpBlockCachePut:
err = s.opBlockCachePut(conn, p)
case OpBlockCacheGet:
err = s.opBlockCacheGet(conn, p)
case OpBlockCacheDel:
err = s.opBlockCacheEvict(conn, p)
default:
err = fmt.Errorf("unknown Opcode: %d", p.Opcode)
}
return
}
func (s *bcacheStore) opBlockCachePut(conn net.Conn, p *BlockCachePacket) (err error) {
req := &PutCacheRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, err.Error())
return
}
s.bcache.cache(req.CacheKey, req.Data, false)
p.PacketOkReplay()
s.response(conn, p)
return
}
func (s *bcacheStore) opBlockCacheGet(conn net.Conn, p *BlockCachePacket) (err error) {
req := &GetCacheRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, string(p.Data))
return
}
cachePath, err := s.bcache.queryCachePath(req.CacheKey, req.Offset, req.Size)
if err != nil {
if err == os.ErrNotExist {
p.PacketErrorWithBody(proto.OpNotExistErr, ([]byte)(err.Error()))
} else {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
}
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, string(p.Data))
return
}
resp := &GetCachePathResponse{CachePath: cachePath}
reply, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, string(p.Data))
return
}
p.PacketOkWithBody(reply)
s.response(conn, p)
return
}
func (s *bcacheStore) opBlockCacheEvict(conn net.Conn, p *BlockCachePacket) (err error) {
req := &DelCacheRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, err.Error())
return
}
s.bcache.erase(req.CacheKey)
p.PacketOkReplay()
s.response(conn, p)
return
}
func (s *bcacheStore) response(conn net.Conn, p *BlockCachePacket) (err error) {
defer func() {
if r := recover(); r != nil {
switch data := r.(type) {
case error:
err = data
default:
err = errors.New(data.(string))
}
}
}()
err = p.WriteToConn(conn)
if err != nil {
log.LogDebugf("response to client[%s], "+
"request[%s]",
err.Error(), p.GetOpMsg())
}
return
}
func (s *bcacheStore) parserConf(cfg *config.Config) (*bcacheConfig, error) {
bconf := &bcacheConfig{}
cacheDir := cfg.GetString(CacheDir)
cacheLimit := cfg.GetString(CacheLimit)
cacheFree := cfg.GetString(CacheFree)
blockSize := cfg.GetString(BlockSize)
bconf.CacheDir = cacheDir
if cacheDir == "" {
return nil, errors.NewErrorf("cacheDir is required.")
}
if v, err := strconv.ParseUint(blockSize, 10, 32); err == nil {
bconf.BlockSize = uint32(v)
}
if v, err := strconv.ParseUint(cacheLimit, 10, 32); err == nil {
bconf.Limit = uint32(v)
}
if v, err := strconv.ParseFloat(cacheFree, 32); err == nil {
bconf.FreeRatio = float32(v)
}
return bconf, nil
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"os"
"syscall"
)
func AccessTime(info os.FileInfo) int64 {
linuxFileAttr := info.Sys().(*syscall.Stat_t)
return linuxFileAttr.Atim.Sec
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package common
import "time"
type Retry struct {
retryTimes int
delayTime uint32
}
func (r Retry) On(caller func() error) error {
var lastErr error
for i := 0; i < r.retryTimes; i++ {
err := caller()
if err == nil {
return nil
}
lastErr = err
time.Sleep(time.Duration(r.delayTime) * time.Millisecond)
}
return lastErr
}
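// Usage sketch (illustrative): retry a flaky call up to 3 times, pausing
// 100 milliseconds between attempts; doSomething is a placeholder.
//
// err := Timed(3, 100).On(func() error { return doSomething() })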
// Timed returns a Retry configured with retryTimes attempts and a delayTime
// pause between attempts, in milliseconds.
func Timed(retryTimes int, delayTime uint32) Retry {
return Retry{
retryTimes: retryTimes,
delayTime: delayTime,
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
// Package taskpool provides limited pool running task
package common
// TaskPool limited pool
type TaskPool struct {
pool chan func()
}
// New returns task pool with workerCount and poolSize
func New(workerCount, poolSize int) TaskPool {
pool := make(chan func(), poolSize)
for i := 0; i < workerCount; i++ {
go func() {
for {
task, ok := <-pool
if !ok {
break
}
task()
}
}()
}
return TaskPool{pool: pool}
}
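// Usage sketch (illustrative): a pool of 4 workers draining a queue of up to
// 16 pending tasks.
//
// tp := New(4, 16)
// tp.Run(func() { /* blocking submit */ })
// ok := tp.TryRun(func() { /* dropped if the queue is full */ })
// tp.Close()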
// Run add task to pool, block if pool is full
func (tp TaskPool) Run(task func()) {
tp.pool <- task
}
// TryRun try to add task to pool, return immediately
func (tp TaskPool) TryRun(task func()) bool {
select {
case tp.pool <- task:
return true
default:
return false
}
}
// Close the pool, the function is concurrent unsafe
func (tp TaskPool) Close() {
close(tp.pool)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"syscall"
"time"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/proto"
)
const (
RootInode = proto.RootIno
)
const (
DefaultBlksize = uint32(1) << 12
DefaultMaxNameLen = uint32(256)
)
const (
DefaultInodeExpiration = 120 * time.Second
MaxInodeCache = 10000000 // in terms of the number of items
DefaultMaxInodeCache = 2000000
)
const (
// the expiration duration of the dentry in the cache (used internally)
DentryValidDuration = 5 * time.Second
DefaultReaddirLimit = 1024
)
const (
DeleteExtentsTimeout = 600 * time.Second
)
const (
MaxSizePutOnce = int64(1) << 23
)
const (
DefaultFlag = 0x0f
)
var (
// The following two control the FUSE kernel cache.
// LookupValidDuration is how long the kernel may treat a lookup (dentry) result as valid before asking the filesystem again.
LookupValidDuration = 5 * time.Second
// the expiration duration of the attributes in the FUSE cache
AttrValidDuration = 30 * time.Second
DisableMetaCache = true
)
// ParseError returns the error type.
func ParseError(err error) fuse.Errno {
switch v := err.(type) {
case syscall.Errno:
return fuse.Errno(v)
case fuse.Errno:
return v
default:
return fuse.EIO
}
}
// ParseType returns the dentry type.
func ParseType(t uint32) fuse.DirentType {
if proto.IsDir(t) {
return fuse.DT_Dir
} else if proto.IsSymlink(t) {
return fuse.DT_Link
}
return fuse.DT_File
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"sync"
"time"
)
// DentryCache defines the dentry cache.
type DentryCache struct {
sync.Mutex
cache map[string]uint64
expiration time.Time
}
// NewDentryCache returns a new dentry cache.
func NewDentryCache() *DentryCache {
return &DentryCache{
cache: make(map[string]uint64),
expiration: time.Now().Add(DentryValidDuration),
}
}
// Put puts an item into the cache.
func (dc *DentryCache) Put(name string, ino uint64) {
if dc == nil {
return
}
dc.Lock()
defer dc.Unlock()
dc.cache[name] = ino
dc.expiration = time.Now().Add(DentryValidDuration)
}
// Get gets the item from the cache based on the given key.
func (dc *DentryCache) Get(name string) (uint64, bool) {
if dc == nil {
return 0, false
}
dc.Lock()
defer dc.Unlock()
if dc.expiration.Before(time.Now()) {
dc.cache = make(map[string]uint64)
return 0, false
}
ino, ok := dc.cache[name]
return ino, ok
}
// Delete deletes the item based on the given key.
func (dc *DentryCache) Delete(name string) {
if dc == nil {
return
}
dc.Lock()
defer dc.Unlock()
delete(dc.cache, name)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"container/list"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
// MinDentryCacheEvictNum is used in the foreground eviction.
// When clearing dentries from the cache, it stops as soon as 10 dentries have been evicted.
MinDentryCacheEvictNum = 10
// MaxDentryCacheEvictNum is used in the background. We can evict at most 200000 dentries.
MaxDentryCacheEvictNum = 200000
DentryBgEvictionInterval = 2 * time.Minute
)
// Dcache defines the structure of the dentry cache.
type Dcache struct {
sync.RWMutex
cache map[string]*list.Element
lruList *list.List
expiration time.Duration
maxElements int
}
// NewDcache returns a new dentry cache.
func NewDcache(exp time.Duration, maxElements int) *Dcache {
dc := &Dcache{
cache: make(map[string]*list.Element),
lruList: list.New(),
expiration: exp,
maxElements: maxElements,
}
go dc.backgroundEviction()
return dc
}
// Put puts the given dentry info into the dentry cache.
func (dc *Dcache) Put(info *proto.DentryInfo) {
dc.Lock()
old, ok := dc.cache[info.Name]
if ok {
dc.lruList.Remove(old)
delete(dc.cache, info.Name)
}
if dc.lruList.Len() >= dc.maxElements {
dc.evict(true)
}
dentrySetExpiration(info, dc.expiration)
element := dc.lruList.PushFront(info)
dc.cache[info.Name] = element
dc.Unlock()
// log.LogDebugf("Dcache put inode: inode(%v)", info.Inode)
}
// Get returns the dentry info based on the given name (parentId+name).
func (dc *Dcache) Get(name string) *proto.DentryInfo {
dc.RLock()
element, ok := dc.cache[name]
if !ok {
dc.RUnlock()
return nil
}
info := element.Value.(*proto.DentryInfo)
if dentryExpired(info) && DisableMetaCache {
dc.RUnlock()
// log.LogDebugf("Dcache GetConnect expired: now(%v) inode(%v), expired(%d)", time.Now().Format(LogTimeFormat), info.Inode, info.Expiration())
return nil
}
dc.RUnlock()
return info
}
// Delete deletes the dentry info based on the given name (parentId+name).
func (dc *Dcache) Delete(name string) {
// log.LogDebugf("Dcache Delete: ino(%v)", ino)
dc.Lock()
element, ok := dc.cache[name]
if ok {
dc.lruList.Remove(element)
delete(dc.cache, name)
}
dc.Unlock()
}
// Foreground eviction cares more about the speed.
// Background eviction evicts all expired items from the cache.
// The caller should grab the WRITE lock of the inode cache.
func (dc *Dcache) evict(foreground bool) {
var count int
for i := 0; i < MinDentryCacheEvictNum; i++ {
element := dc.lruList.Back()
if element == nil {
return
}
// For background eviction, if all expired items have been evicted, just return.
// But for foreground eviction, we need to evict at least MinDentryCacheEvictNum dentries.
// Foreground eviction does not care whether the dentry has expired or not.
info := element.Value.(*proto.DentryInfo)
if !foreground && !dentryExpired(info) {
return
}
// log.LogDebugf("Dcache GetConnect expired: now(%v) inode(%v)", time.Now().Format(LogTimeFormat), info.Inode)
dc.lruList.Remove(element)
delete(dc.cache, info.Name)
count++
}
// For background eviction, we need to continue evict all expired items from the cache
if foreground {
return
}
for i := 0; i < MaxDentryCacheEvictNum; i++ {
element := dc.lruList.Back()
if element == nil {
break
}
info := element.Value.(*proto.DentryInfo)
if !dentryExpired(info) {
break
}
// log.LogDebugf("Dcache GetConnect expired: now(%v) inode(%v)", time.Now().Format(LogTimeFormat), info.Inode)
dc.lruList.Remove(element)
delete(dc.cache, info.Name)
count++
}
}
func (dc *Dcache) backgroundEviction() {
t := time.NewTicker(DentryBgEvictionInterval)
defer t.Stop()
for range t.C {
log.LogInfof("Dcache: start BG evict")
if !DisableMetaCache {
log.LogInfof("Dcache: no need to do BG evict")
continue
}
start := time.Now()
dc.Lock()
dc.evict(false)
dc.Unlock()
elapsed := time.Since(start)
log.LogInfof("Dcache: total inode cache(%d), cost(%d)ns", dc.lruList.Len(), elapsed.Nanoseconds())
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"syscall"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
)
func isDirectIOEnabled(flags fuse.OpenFlags) bool {
return (int(flags) & syscall.O_DIRECT) != 0
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"context"
"fmt"
"io"
"os"
"path"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/depends/bazil.org/fuse/fs"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// used to locate the position in parent
type DirContext struct {
Name string
}
type DirContexts struct {
sync.RWMutex
dirCtx map[fuse.HandleID]*DirContext
}
func NewDirContexts() (dctx *DirContexts) {
dctx = &DirContexts{}
dctx.dirCtx = make(map[fuse.HandleID]*DirContext, 0)
return
}
func (dctx *DirContexts) GetCopy(handle fuse.HandleID) DirContext {
dctx.RLock()
dirCtx, found := dctx.dirCtx[handle]
dctx.RUnlock()
if found {
return DirContext{dirCtx.Name}
} else {
return DirContext{}
}
}
func (dctx *DirContexts) Put(handle fuse.HandleID, dirCtx *DirContext) {
dctx.Lock()
defer dctx.Unlock()
oldCtx, found := dctx.dirCtx[handle]
if found {
oldCtx.Name = dirCtx.Name
return
}
dctx.dirCtx[handle] = dirCtx
}
func (dctx *DirContexts) Remove(handle fuse.HandleID) {
dctx.Lock()
delete(dctx.dirCtx, handle)
dctx.Unlock()
}
// Dir defines the structure of a directory
type Dir struct {
super *Super
info *proto.InodeInfo
dcache *DentryCache
dctx *DirContexts
parentIno uint64
name string
}
// Functions that Dir needs to implement
var (
_ fs.Node = (*Dir)(nil)
_ fs.NodeCreater = (*Dir)(nil)
_ fs.NodeForgetter = (*Dir)(nil)
_ fs.NodeMkdirer = (*Dir)(nil)
_ fs.NodeMknoder = (*Dir)(nil)
_ fs.NodeRemover = (*Dir)(nil)
_ fs.NodeFsyncer = (*Dir)(nil)
_ fs.NodeRequestLookuper = (*Dir)(nil)
_ fs.HandleReadDirAller = (*Dir)(nil)
_ fs.NodeRenamer = (*Dir)(nil)
_ fs.NodeSetattrer = (*Dir)(nil)
_ fs.NodeSymlinker = (*Dir)(nil)
_ fs.NodeGetxattrer = (*Dir)(nil)
_ fs.NodeListxattrer = (*Dir)(nil)
_ fs.NodeSetxattrer = (*Dir)(nil)
_ fs.NodeRemovexattrer = (*Dir)(nil)
)
// NewDir returns a new directory.
func NewDir(s *Super, i *proto.InodeInfo, pino uint64, dirName string) fs.Node {
return &Dir{
super: s,
info: i,
parentIno: pino,
name: dirName,
dctx: NewDirContexts(),
}
}
// Attr sets the attributes of a directory.
func (d *Dir) Attr(ctx context.Context, a *fuse.Attr) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Attr", err, bgTime, 1)
}()
ino := d.info.Inode
info, err := d.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Attr: ino(%v) err(%v)", ino, err)
return ParseError(err)
}
fillAttr(info, a)
log.LogDebugf("TRACE Attr: inode(%v)", info)
return nil
}
func (d *Dir) Release(ctx context.Context, req *fuse.ReleaseRequest) (err error) {
d.dctx.Remove(req.Handle)
return nil
}
// Create handles the create request.
func (d *Dir) Create(ctx context.Context, req *fuse.CreateRequest, resp *fuse.CreateResponse) (fs.Node, fs.Handle, error) {
start := time.Now()
bgTime := stat.BeginStat()
var err error
var newInode uint64
metric := exporter.NewTPCnt("filecreate")
fullPath := path.Join(d.getCwd(), req.Name)
defer func() {
stat.EndStat("Create", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
auditlog.LogClientOp("Create", fullPath, "nil", err, time.Since(start).Microseconds(), newInode, 0)
}()
info, err := d.super.mw.Create_ll(d.info.Inode, req.Name, proto.Mode(req.Mode.Perm()), req.Uid, req.Gid, nil, fullPath)
if err != nil {
log.LogErrorf("Create: parent(%v) req(%v) err(%v)", d.info.Inode, req, err)
return nil, nil, ParseError(err)
}
d.super.ic.Put(info)
child := NewFile(d.super, info, uint32(req.Flags&DefaultFlag), d.info.Inode, req.Name)
newInode = info.Inode
d.super.ec.OpenStream(info.Inode)
d.super.fslock.Lock()
d.super.nodeCache[info.Inode] = child
d.super.fslock.Unlock()
if d.super.keepCache {
resp.Flags |= fuse.OpenKeepCache
}
resp.EntryValid = LookupValidDuration
d.super.ic.Delete(d.info.Inode)
elapsed := time.Since(start)
log.LogDebugf("TRACE Create: parent(%v) req(%v) resp(%v) ino(%v) (%v)ns", d.info.Inode, req, resp, info.Inode, elapsed.Nanoseconds())
return child, child, nil
}
// Forget is called when the kernel evicts the inode.
func (d *Dir) Forget() {
bgTime := stat.BeginStat()
ino := d.info.Inode
defer func() {
stat.EndStat("Forget", nil, bgTime, 1)
log.LogDebugf("TRACE Forget: ino(%v)", ino)
}()
d.super.ic.Delete(ino)
d.super.fslock.Lock()
delete(d.super.nodeCache, ino)
d.super.fslock.Unlock()
}
// Mkdir handles the mkdir request.
func (d *Dir) Mkdir(ctx context.Context, req *fuse.MkdirRequest) (fs.Node, error) {
start := time.Now()
bgTime := stat.BeginStat()
var err error
var newInode uint64
metric := exporter.NewTPCnt("mkdir")
fullPath := path.Join(d.getCwd(), req.Name)
defer func() {
stat.EndStat("Mkdir", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
auditlog.LogClientOp("Mkdir", fullPath, "nil", err, time.Since(start).Microseconds(), newInode, 0)
}()
info, err := d.super.mw.Create_ll(d.info.Inode, req.Name, proto.Mode(os.ModeDir|req.Mode.Perm()), req.Uid, req.Gid, nil, fullPath)
if err != nil {
log.LogErrorf("Mkdir: parent(%v) req(%v) err(%v)", d.info.Inode, req, err)
return nil, ParseError(err)
}
d.super.ic.Put(info)
child := NewDir(d.super, info, d.info.Inode, req.Name)
newInode = info.Inode
d.super.fslock.Lock()
d.super.nodeCache[info.Inode] = child
d.super.fslock.Unlock()
d.super.ic.Delete(d.info.Inode)
elapsed := time.Since(start)
log.LogDebugf("TRACE Mkdir: parent(%v) req(%v) ino(%v) (%v)ns", d.info.Inode, req, info.Inode, elapsed.Nanoseconds())
return child, nil
}
// Remove handles the remove request.
func (d *Dir) Remove(ctx context.Context, req *fuse.RemoveRequest) error {
start := time.Now()
d.dcache.Delete(req.Name)
dcacheKey := d.buildDcacheKey(d.info.Inode, req.Name)
d.super.dc.Delete(dcacheKey)
bgTime := stat.BeginStat()
var err error
var deletedInode uint64
metric := exporter.NewTPCnt("remove")
fullPath := path.Join(d.getCwd(), req.Name)
defer func() {
stat.EndStat("Remove", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
auditlog.LogClientOp("Remove", fullPath, "nil", err, time.Since(start).Microseconds(), deletedInode, 0)
}()
info, err := d.super.mw.Delete_ll(d.info.Inode, req.Name, req.Dir, fullPath)
if err != nil {
log.LogErrorf("Remove: parent(%v) name(%v) err(%v)", d.info.Inode, req.Name, err)
return ParseError(err)
}
if info != nil {
deletedInode = info.Inode
}
d.super.ic.Delete(d.info.Inode)
if info != nil && info.Nlink == 0 && !proto.IsDir(info.Mode) {
d.super.orphan.Put(info.Inode)
log.LogDebugf("Remove: add to orphan inode list, ino(%v)", info.Inode)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Remove: parent(%v) req(%v) inode(%v) (%v)ns", d.info.Inode, req, info, elapsed.Nanoseconds())
return nil
}
func (d *Dir) Fsync(ctx context.Context, req *fuse.FsyncRequest) error {
return nil
}
// Lookup handles the lookup request.
func (d *Dir) Lookup(ctx context.Context, req *fuse.LookupRequest, resp *fuse.LookupResponse) (fs.Node, error) {
var (
ino uint64
err error
dcachev2 bool
)
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Lookup", err, bgTime, 1)
}()
log.LogDebugf("TRACE Lookup: parent(%v) req(%v)", d.info.Inode, req)
log.LogDebugf("TRACE Lookup: parent(%v) path(%v) d.super.bcacheDir(%v)", d.info.Inode, d.getCwd(), d.super.bcacheDir)
if d.needDentrycache() {
dcachev2 = true
}
if dcachev2 {
lookupMetric := exporter.NewCounter("lookupDcache")
lookupMetric.AddWithLabels(1, map[string]string{exporter.Vol: d.super.volname})
dcacheKey := d.buildDcacheKey(d.info.Inode, req.Name)
dentryInfo := d.super.dc.Get(dcacheKey)
if dentryInfo == nil {
lookupMetric := exporter.NewCounter("lookupDcacheMiss")
lookupMetric.AddWithLabels(1, map[string]string{exporter.Vol: d.super.volname})
ino, _, err = d.super.mw.Lookup_ll(d.info.Inode, req.Name)
if err != nil {
if err != syscall.ENOENT {
log.LogErrorf("Lookup: parent(%v) name(%v) err(%v)", d.info.Inode, req.Name, err)
}
return nil, ParseError(err)
}
info := &proto.DentryInfo{
Name: dcacheKey,
Inode: ino,
}
d.super.dc.Put(info)
} else {
lookupMetric := exporter.NewCounter("lookupDcacheHit")
lookupMetric.AddWithLabels(1, map[string]string{exporter.Vol: d.super.volname})
ino = dentryInfo.Inode
}
} else {
cino, ok := d.dcache.Get(req.Name)
if !ok {
cino, _, err = d.super.mw.Lookup_ll(d.info.Inode, req.Name)
if err != nil {
if err != syscall.ENOENT {
log.LogErrorf("Lookup: parent(%v) name(%v) err(%v)", d.info.Inode, req.Name, err)
}
return nil, ParseError(err)
}
}
ino = cino
}
info, err := d.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Lookup: parent(%v) name(%v) ino(%v) err(%v)", d.info.Inode, req.Name, ino, err)
dummyInodeInfo := &proto.InodeInfo{Inode: ino}
dummyChild := NewFile(d.super, dummyInodeInfo, DefaultFlag, d.info.Inode, req.Name)
return dummyChild, nil
}
mode := proto.OsMode(info.Mode)
d.super.fslock.Lock()
child, ok := d.super.nodeCache[ino]
if !ok {
if mode.IsDir() {
child = NewDir(d.super, info, d.info.Inode, req.Name)
} else {
child = NewFile(d.super, info, DefaultFlag, d.info.Inode, req.Name)
}
d.super.nodeCache[ino] = child
}
d.super.fslock.Unlock()
resp.EntryValid = LookupValidDuration
log.LogDebugf("TRACE Lookup exit: parent(%v) req(%v) cost (%d)", d.info.Inode, req, time.Since(*bgTime).Microseconds())
return child, nil
}
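// buildDcacheKey composes the dentry-cache key as "<parentInode>_<name>",
// e.g. (illustrative) parent inode 1 and name "foo" yield "1_foo".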
func (d *Dir) buildDcacheKey(inode uint64, name string) string {
return fmt.Sprintf("%v_%v", inode, name)
}
func (d *Dir) ReadDir(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) ([]fuse.Dirent, error) {
var err error
var limit uint64 = DefaultReaddirLimit
start := time.Now()
bgTime := stat.BeginStat()
// var err error
metric := exporter.NewTPCnt("readdir")
defer func() {
stat.EndStat("ReadDirLimit", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
var dirCtx DirContext
if req.Offset != 0 {
dirCtx = d.dctx.GetCopy(req.Handle)
} else {
dirCtx = DirContext{}
}
children, err := d.super.mw.ReadDirLimit_ll(d.info.Inode, dirCtx.Name, limit)
if err != nil {
log.LogErrorf("readdirlimit: Readdir: ino(%v) err(%v) offset %v", d.info.Inode, err, req.Offset)
return make([]fuse.Dirent, 0), ParseError(err)
}
if req.Offset == 0 {
if len(children) == 0 {
dirents := make([]fuse.Dirent, 0, len(children))
dirents = append(dirents, fuse.Dirent{
Inode: d.info.Inode,
Type: fuse.DT_Dir,
Name: ".",
})
pid := uint64(req.Pid)
if d.info.Inode == 1 {
pid = d.info.Inode
}
dirents = append(dirents, fuse.Dirent{
Inode: pid,
Type: fuse.DT_Dir,
Name: "..",
})
return dirents, io.EOF
}
children = append([]proto.Dentry{{
Name: ".",
Inode: d.info.Inode,
Type: uint32(os.ModeDir),
}, {
Name: "..",
Inode: uint64(req.Pid),
Type: uint32(os.ModeDir),
}}, children...)
}
// skip the first one, which is already accessed
childrenNr := uint64(len(children))
if childrenNr == 0 || (dirCtx.Name != "" && childrenNr == 1) {
return make([]fuse.Dirent, 0), io.EOF
} else if childrenNr < limit {
err = io.EOF
}
if dirCtx.Name != "" {
children = children[1:]
}
/* update dirCtx */
dirCtx.Name = children[len(children)-1].Name
d.dctx.Put(req.Handle, &dirCtx)
inodes := make([]uint64, 0, len(children))
dirents := make([]fuse.Dirent, 0, len(children))
log.LogDebugf("Readdir ino(%v) path(%v) d.super.bcacheDir(%v)", d.info.Inode, d.getCwd(), d.super.bcacheDir)
var dcache *DentryCache
if !d.super.disableDcache {
dcache = NewDentryCache()
}
var dcachev2 bool
if d.needDentrycache() {
dcachev2 = true
}
for _, child := range children {
dentry := fuse.Dirent{
Inode: child.Inode,
Type: ParseType(child.Type),
Name: child.Name,
}
inodes = append(inodes, child.Inode)
dirents = append(dirents, dentry)
if dcachev2 {
info := &proto.DentryInfo{
Name: d.buildDcacheKey(d.info.Inode, child.Name),
Inode: child.Inode,
}
d.super.dc.Put(info)
} else {
dcache.Put(child.Name, child.Inode)
}
}
infos := d.super.mw.BatchInodeGet(inodes)
for _, info := range infos {
d.super.ic.Put(info)
}
d.dcache = dcache
elapsed := time.Since(start)
log.LogDebugf("TRACE ReadDir exit: ino(%v) (%v)ns %v", d.info.Inode, elapsed.Nanoseconds(), req)
return dirents, err
}
// ReadDirAll gets all the dentries in a directory and puts them into the cache.
func (d *Dir) ReadDirAll(ctx context.Context) ([]fuse.Dirent, error) {
start := time.Now()
bgTime := stat.BeginStat()
var err error
metric := exporter.NewTPCnt("readdir")
defer func() {
stat.EndStat("ReadDirAll", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
// transform ReadDirAll to ReadDirLimit_ll
noMore := false
from := ""
var children []proto.Dentry
for !noMore {
batches, err := d.super.mw.ReadDirLimit_ll(d.info.Inode, from, DefaultReaddirLimit)
if err != nil {
log.LogErrorf("Readdir: ino(%v) err(%v) from(%v)", d.info.Inode, err, from)
return make([]fuse.Dirent, 0), ParseError(err)
}
batchNr := uint64(len(batches))
if batchNr == 0 || (from != "" && batchNr == 1) {
noMore = true
break
} else if batchNr < DefaultReaddirLimit {
noMore = true
}
if from != "" {
batches = batches[1:]
}
children = append(children, batches...)
from = batches[len(batches)-1].Name
}
inodes := make([]uint64, 0, len(children))
dirents := make([]fuse.Dirent, 0, len(children))
log.LogDebugf("Readdir ino(%v) path(%v) d.super.bcacheDir(%v)", d.info.Inode, d.getCwd(), d.super.bcacheDir)
var dcache *DentryCache
if !d.super.disableDcache {
dcache = NewDentryCache()
}
var dcachev2 bool
if d.needDentrycache() {
dcachev2 = true
}
for _, child := range children {
dentry := fuse.Dirent{
Inode: child.Inode,
Type: ParseType(child.Type),
Name: child.Name,
}
inodes = append(inodes, child.Inode)
dirents = append(dirents, dentry)
if dcachev2 {
info := &proto.DentryInfo{
Name: d.buildDcacheKey(d.info.Inode, child.Name),
Inode: child.Inode,
}
d.super.dc.Put(info)
} else {
dcache.Put(child.Name, child.Inode)
}
}
infos := d.super.mw.BatchInodeGet(inodes)
for _, info := range infos {
d.super.ic.Put(info)
}
d.dcache = dcache
elapsed := time.Since(start)
log.LogDebugf("TRACE ReadDirAll: ino(%v) (%v)ns", d.info.Inode, elapsed.Nanoseconds())
return dirents, nil
}
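// Note on the pagination contract used by ReadDirAll above (an editorial
// sketch, not additional behavior): ReadDirLimit_ll(ino, from, limit) returns
// up to `limit` dentries starting at name `from`, and when `from` is non-empty
// the first entry repeats `from` itself, which is why every follow-up batch
// drops element 0. A minimal caller following the same pattern:
//
//	from := ""
//	for {
//		batch, err := mw.ReadDirLimit_ll(ino, from, DefaultReaddirLimit)
//		if err != nil || len(batch) == 0 {
//			break
//		}
//		if from != "" {
//			batch = batch[1:] // drop the duplicated boundary entry
//		}
//		if len(batch) == 0 {
//			break
//		}
//		// consume batch ...
//		from = batch[len(batch)-1].Name
//	}
//
// Here mw and ino stand for any meta wrapper and directory inode.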
// Rename handles the rename request.
func (d *Dir) Rename(ctx context.Context, req *fuse.RenameRequest, newDir fs.Node) error {
dstDir, ok := newDir.(*Dir)
if !ok {
log.LogErrorf("Rename: NOT DIR, parent(%v) req(%v)", d.info.Inode, req)
return fuse.ENOTSUP
}
start := time.Now()
var srcInode uint64 // must exist
var dstInode uint64 // may not exist
var err error
if ino, ok := dstDir.dcache.Get(req.NewName); ok {
dstInode = ino
}
if ino, ok := d.dcache.Get(req.OldName); ok {
srcInode = ino
} else {
// normally not reached: fall back to a direct lookup on dcache miss
if ino, _, err := d.super.mw.Lookup_ll(d.info.Inode, req.OldName); err == nil {
srcInode = ino
}
}
d.dcache.Delete(req.OldName)
dcacheKey := d.buildDcacheKey(d.info.Inode, req.OldName)
d.super.dc.Delete(dcacheKey)
bgTime := stat.BeginStat()
metric := exporter.NewTPCnt("rename")
srcPath := path.Join(d.getCwd(), req.OldName)
dstPath := path.Join(dstDir.getCwd(), req.NewName)
defer func() {
stat.EndStat("Rename", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
d.super.fslock.Lock()
node, ok := d.super.nodeCache[srcInode]
if ok && srcInode != 0 {
if dir, ok := node.(*Dir); ok {
dir.name = req.NewName
dir.parentIno = dstDir.info.Inode
} else {
file := node.(*File)
file.name = req.NewName
file.parentIno = dstDir.info.Inode
}
}
d.super.fslock.Unlock()
auditlog.LogClientOp("Rename", srcPath, dstPath, err, time.Since(start).Microseconds(), srcInode, dstInode)
}()
// changePathMap := d.super.mw.GetChangeQuota(d.getCwd()+"/"+req.OldName, dstDir.getCwd()+"/"+req.NewName)
if d.super.mw.EnableQuota {
if !d.canRenameByQuota(dstDir, req.OldName) {
return fuse.EPERM
}
}
err = d.super.mw.Rename_ll(d.info.Inode, req.OldName, dstDir.info.Inode, req.NewName, srcPath, dstPath, true)
if err != nil {
log.LogErrorf("Rename: parent(%v) req(%v) err(%v)", d.info.Inode, req, err)
return ParseError(err)
}
// if len(changePathMap) != 0 {
// d.super.mw.BatchModifyQuotaPath(changePathMap)
// }
d.super.ic.Delete(d.info.Inode)
d.super.ic.Delete(dstDir.info.Inode)
elapsed := time.Since(start)
log.LogDebugf("TRACE Rename: SrcParent(%v) OldName(%v) DstParent(%v) NewName(%v) (%v)ns", d.info.Inode, req.OldName, dstDir.info.Inode, req.NewName, elapsed.Nanoseconds())
return nil
}
// Setattr handles the setattr request.
func (d *Dir) Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *fuse.SetattrResponse) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Setattr", err, bgTime, 1)
}()
ino := d.info.Inode
start := time.Now()
info, err := d.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Setattr: ino(%v) err(%v)", ino, err)
return ParseError(err)
}
if valid := setattr(info, req); valid != 0 {
err = d.super.mw.Setattr(ino, valid, info.Mode, info.Uid, info.Gid, info.AccessTime.Unix(),
info.ModifyTime.Unix())
if err != nil {
d.super.ic.Delete(ino)
return ParseError(err)
}
}
fillAttr(info, &resp.Attr)
elapsed := time.Since(start)
log.LogDebugf("TRACE Setattr: ino(%v) req(%v) inodeSize(%v) (%v)ns", ino, req, info.Size, elapsed.Nanoseconds())
return nil
}
func (d *Dir) Mknod(ctx context.Context, req *fuse.MknodRequest) (fs.Node, error) {
if req.Rdev != 0 {
return nil, fuse.ENOSYS
}
start := time.Now()
bgTime := stat.BeginStat()
var err error
metric := exporter.NewTPCnt("mknod")
defer func() {
stat.EndStat("Mknod", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
fullPath := path.Join(d.getCwd(), req.Name)
info, err := d.super.mw.Create_ll(d.info.Inode, req.Name, proto.Mode(req.Mode), req.Uid, req.Gid, nil, fullPath)
if err != nil {
log.LogErrorf("Mknod: parent(%v) req(%v) err(%v)", d.info.Inode, req, err)
return nil, ParseError(err)
}
d.super.ic.Put(info)
child := NewFile(d.super, info, DefaultFlag, d.info.Inode, req.Name)
d.super.fslock.Lock()
d.super.nodeCache[info.Inode] = child
d.super.fslock.Unlock()
elapsed := time.Since(start)
log.LogDebugf("TRACE Mknod: parent(%v) req(%v) ino(%v) (%v)ns", d.info.Inode, req, info.Inode, elapsed.Nanoseconds())
return child, nil
}
// Symlink handles the symlink request.
func (d *Dir) Symlink(ctx context.Context, req *fuse.SymlinkRequest) (fs.Node, error) {
parentIno := d.info.Inode
start := time.Now()
bgTime := stat.BeginStat()
var err error
metric := exporter.NewTPCnt("symlink")
defer func() {
stat.EndStat("Symlink", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
fullPath := path.Join(d.getCwd(), req.NewName)
info, err := d.super.mw.Create_ll(parentIno, req.NewName, proto.Mode(os.ModeSymlink|os.ModePerm), req.Uid, req.Gid, []byte(req.Target), fullPath)
if err != nil {
log.LogErrorf("Symlink: parent(%v) NewName(%v) err(%v)", parentIno, req.NewName, err)
return nil, ParseError(err)
}
d.super.ic.Put(info)
child := NewFile(d.super, info, DefaultFlag, d.info.Inode, req.NewName)
d.super.fslock.Lock()
d.super.nodeCache[info.Inode] = child
d.super.fslock.Unlock()
elapsed := time.Since(start)
log.LogDebugf("TRACE Symlink: parent(%v) req(%v) ino(%v) (%v)ns", parentIno, req, info.Inode, elapsed.Nanoseconds())
return child, nil
}
// Link handles the link request.
func (d *Dir) Link(ctx context.Context, req *fuse.LinkRequest, old fs.Node) (fs.Node, error) {
var oldInode *proto.InodeInfo
switch old := old.(type) {
case *File:
oldInode = old.info
default:
return nil, fuse.EPERM
}
if !proto.IsRegular(oldInode.Mode) {
log.LogErrorf("Link: not regular, parent(%v) name(%v) ino(%v) mode(%v)", d.info.Inode, req.NewName, oldInode.Inode, proto.OsMode(oldInode.Mode))
return nil, fuse.EPERM
}
start := time.Now()
bgTime := stat.BeginStat()
var err error
metric := exporter.NewTPCnt("link")
defer func() {
stat.EndStat("Link", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
fullPath := path.Join(d.getCwd(), req.NewName)
info, err := d.super.mw.Link(d.info.Inode, req.NewName, oldInode.Inode, fullPath)
if err != nil {
log.LogErrorf("Link: parent(%v) name(%v) ino(%v) err(%v)", d.info.Inode, req.NewName, oldInode.Inode, err)
return nil, ParseError(err)
}
d.super.ic.Put(info)
d.super.fslock.Lock()
newFile, ok := d.super.nodeCache[info.Inode]
if !ok {
newFile = NewFile(d.super, info, DefaultFlag, d.info.Inode, req.NewName)
d.super.nodeCache[info.Inode] = newFile
}
d.super.fslock.Unlock()
elapsed := time.Since(start)
log.LogDebugf("TRACE Link: parent(%v) name(%v) ino(%v) (%v)ns", d.info.Inode, req.NewName, info.Inode, elapsed.Nanoseconds())
return newFile, nil
}
// Getxattr handles the getxattr request.
func (d *Dir) Getxattr(ctx context.Context, req *fuse.GetxattrRequest, resp *fuse.GetxattrResponse) error {
if !d.super.enableXattr {
return fuse.ENOSYS
}
ino := d.info.Inode
name := req.Name
size := req.Size
pos := req.Position
var value []byte
var info *proto.XAttrInfo
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Getxattr", err, bgTime, 1)
}()
if name == meta.SummaryKey {
if !d.super.mw.EnableSummary {
return fuse.ENOSYS
}
var summaryInfo meta.SummaryInfo
cacheSummaryInfo := d.super.sc.Get(ino)
if cacheSummaryInfo != nil {
summaryInfo = *cacheSummaryInfo
} else {
summaryInfo, err = d.super.mw.GetSummary_ll(ino, 20)
if err != nil {
log.LogErrorf("GetXattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
d.super.sc.Put(ino, &summaryInfo)
}
files := summaryInfo.Files
subdirs := summaryInfo.Subdirs
fbytes := summaryInfo.Fbytes
summaryStr := "Files:" + strconv.FormatInt(int64(files), 10) + "," +
"Dirs:" + strconv.FormatInt(int64(subdirs), 10) + "," +
"Bytes:" + strconv.FormatInt(int64(fbytes), 10)
value = []byte(summaryStr)
} else {
info, err = d.super.mw.XAttrGet_ll(ino, name)
if err != nil {
log.LogErrorf("GetXattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
value = info.Get(name)
}
if pos > 0 {
value = value[pos:]
}
if size > 0 && size < uint32(len(value)) {
value = value[:size]
}
resp.Xattr = value
log.LogDebugf("TRACE GetXattr: ino(%v) name(%v)", ino, name)
return nil
}
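// Note on the summary xattr handled above (descriptive only, derived from the
// code in this function): when req.Name equals meta.SummaryKey the value is a
// computed plain-text triple rather than a stored xattr, formatted as
//
//	Files:<file count>,Dirs:<subdir count>,Bytes:<total bytes>
//
// e.g. a directory with 3 files, 1 subdirectory and 4096 bytes yields
// "Files:3,Dirs:1,Bytes:4096". Position and size handling then slice this
// string exactly like a regular xattr value.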
// Listxattr handles the listxattr request.
func (d *Dir) Listxattr(ctx context.Context, req *fuse.ListxattrRequest, resp *fuse.ListxattrResponse) error {
if !d.super.enableXattr {
return fuse.ENOSYS
}
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Getxattr", err, bgTime, 1)
}()
ino := d.info.Inode
_ = req.Size // ignore currently
_ = req.Position // ignore currently
keys, err := d.super.mw.XAttrsList_ll(ino)
if err != nil {
log.LogErrorf("ListXattr: ino(%v) err(%v)", ino, err)
return ParseError(err)
}
for _, key := range keys {
resp.Append(key)
}
log.LogDebugf("TRACE Listxattr: ino(%v)", ino)
return nil
}
// Setxattr handles the setxattr request.
func (d *Dir) Setxattr(ctx context.Context, req *fuse.SetxattrRequest) error {
if !d.super.enableXattr {
return fuse.ENOSYS
}
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Setxattr", err, bgTime, 1)
}()
ino := d.info.Inode
name := req.Name
value := req.Xattr
if name == meta.SummaryKey {
log.LogErrorf("Set 'DirStat' is not supported.")
return fuse.ENOSYS
}
// TODO: implement flags to improve compatibility (Mofei Zhang)
if err = d.super.mw.XAttrSet_ll(ino, []byte(name), []byte(value)); err != nil {
log.LogErrorf("Setxattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
log.LogDebugf("TRACE Setxattr: ino(%v) name(%v)", ino, name)
return nil
}
// Removexattr handles the removexattr request.
func (d *Dir) Removexattr(ctx context.Context, req *fuse.RemovexattrRequest) error {
if !d.super.enableXattr {
return fuse.ENOSYS
}
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Removexattr", err, bgTime, 1)
}()
ino := d.info.Inode
name := req.Name
if name == meta.SummaryKey {
log.LogErrorf("Remove 'DirStat' is not supported.")
return fuse.ENOSYS
}
if err = d.super.mw.XAttrDel_ll(ino, name); err != nil {
log.LogErrorf("Removexattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
log.LogDebugf("TRACE RemoveXattr: ino(%v) name(%v)", ino, name)
return nil
}
func (d *Dir) getCwd() string {
dirPath := ""
if d.info.Inode == d.super.rootIno {
return "/"
}
curIno := d.info.Inode
for curIno != d.super.rootIno {
d.super.fslock.Lock()
node, ok := d.super.nodeCache[curIno]
d.super.fslock.Unlock()
if !ok {
log.LogErrorf("Get node cache failed: ino(%v)", curIno)
return "unknown" + dirPath
}
curDir, ok := node.(*Dir)
if !ok {
log.LogErrorf("Type error: Can not convert node -> *Dir, ino(%v)", curDir.parentIno)
return "unknown" + dirPath
}
dirPath = "/" + curDir.name + dirPath
curIno = curDir.parentIno
}
return dirPath
}
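// Worked example for getCwd (illustrative): with nodeCache containing
// {5: Dir{name: "a", parentIno: rootIno}, 9: Dir{name: "b", parentIno: 5}},
// calling getCwd on the directory with inode 9 climbs 9 -> 5 -> root and
// returns "/a/b"; a missing or non-*Dir entry along the way short-circuits to
// an "unknown"-prefixed path instead.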
func (d *Dir) needDentrycache() bool {
return !DisableMetaCache && d.super.bcacheDir != "" && strings.HasPrefix(d.getCwd(), d.super.bcacheDir)
}
func dentryExpired(info *proto.DentryInfo) bool {
return time.Now().UnixNano() > info.Expiration()
}
func dentrySetExpiration(info *proto.DentryInfo, t time.Duration) {
info.SetExpiration(time.Now().Add(t).UnixNano())
}
func (d *Dir) canRenameByQuota(dstDir *Dir, srcName string) bool {
fullPaths := d.super.mw.GetQuotaFullPaths()
if len(fullPaths) == 0 {
return true
}
var srcPath string
if d.getCwd() == "/" {
srcPath = "/" + srcName
} else {
srcPath = d.getCwd() + "/" + srcName
}
for _, fullPath := range fullPaths {
log.LogDebugf("srcPath [%v] fullPath[%v].", srcPath, fullPath)
if proto.IsAncestor(srcPath, fullPath) {
return false
}
}
return true
}
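// Worked example for canRenameByQuota (illustrative; it assumes that
// proto.IsAncestor(a, b) reports whether path a is an ancestor of, or equal
// to, path b): with quota full paths ["/q1"], renaming "q1" in the root
// directory gives srcPath "/q1", which is an ancestor of the quota root, so
// the rename is refused with EPERM; renaming "/data/file" matches no quota
// path and is allowed.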
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"context"
"fmt"
"io"
"path"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/depends/bazil.org/fuse/fs"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/blobstore"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// File defines the structure of a file.
type File struct {
super *Super
info *proto.InodeInfo
idle int32
parentIno uint64
name string
sync.RWMutex
fReader *blobstore.Reader
fWriter *blobstore.Writer
}
// Interfaces that File must implement
var (
_ fs.Node = (*File)(nil)
_ fs.Handle = (*File)(nil)
_ fs.NodeForgetter = (*File)(nil)
_ fs.NodeOpener = (*File)(nil)
_ fs.HandleReleaser = (*File)(nil)
_ fs.HandleReader = (*File)(nil)
_ fs.HandleWriter = (*File)(nil)
_ fs.HandleFlusher = (*File)(nil)
_ fs.NodeFsyncer = (*File)(nil)
_ fs.NodeSetattrer = (*File)(nil)
_ fs.NodeReadlinker = (*File)(nil)
_ fs.NodeGetxattrer = (*File)(nil)
_ fs.NodeListxattrer = (*File)(nil)
_ fs.NodeSetxattrer = (*File)(nil)
_ fs.NodeRemovexattrer = (*File)(nil)
)
// NewFile returns a new file.
func NewFile(s *Super, i *proto.InodeInfo, flag uint32, pino uint64, filename string) fs.Node {
if proto.IsCold(s.volType) {
var (
fReader *blobstore.Reader
fWriter *blobstore.Writer
clientConf blobstore.ClientConfig
)
clientConf = blobstore.ClientConfig{
VolName: s.volname,
VolType: s.volType,
Ino: i.Inode,
BlockSize: s.EbsBlockSize,
Bc: s.bc,
Mw: s.mw,
Ec: s.ec,
Ebsc: s.ebsc,
EnableBcache: s.enableBcache,
WConcurrency: s.writeThreads,
ReadConcurrency: s.readThreads,
CacheAction: s.CacheAction,
FileCache: false,
FileSize: i.Size,
CacheThreshold: s.CacheThreshold,
}
log.LogDebugf("Trace NewFile:flag(%v). clientConf(%v)", flag, clientConf)
switch flag {
case syscall.O_RDONLY:
fReader = blobstore.NewReader(clientConf)
case syscall.O_WRONLY:
fWriter = blobstore.NewWriter(clientConf)
case syscall.O_RDWR:
fReader = blobstore.NewReader(clientConf)
fWriter = blobstore.NewWriter(clientConf)
default:
// nothing to do
}
log.LogDebugf("Trace NewFile:fReader(%v) fWriter(%v) ", fReader, fWriter)
return &File{super: s, info: i, fWriter: fWriter, fReader: fReader, parentIno: pino, name: filename}
}
return &File{super: s, info: i, parentIno: pino, name: filename}
}
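// Open-flag handling in NewFile, restated for clarity (descriptive only; the
// constants come from the syscall package): for cold volumes the flag decides
// which blobstore endpoints are created.
//
//	flag              fReader          fWriter
//	syscall.O_RDONLY  NewReader(conf)  nil
//	syscall.O_WRONLY  nil              NewWriter(conf)
//	syscall.O_RDWR    NewReader(conf)  NewWriter(conf)
//	other             nil              nil
//
// Hot volumes skip blobstore entirely and return a plain File.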
// getParentPath returns the path of the file's parent directory.
func (f *File) getParentPath() string {
if f.parentIno == f.super.rootIno {
return "/"
}
f.super.fslock.Lock()
node, ok := f.super.nodeCache[f.parentIno]
f.super.fslock.Unlock()
if !ok {
log.LogErrorf("Get node cache failed: ino(%v)", f.parentIno)
return "unknown"
}
parentDir, ok := node.(*Dir)
if !ok {
log.LogErrorf("Type error: Can not convert node -> *Dir, ino(%v)", f.parentIno)
return "unknown"
}
return parentDir.getCwd()
}
// Attr fills in the attributes of the file.
func (f *File) Attr(ctx context.Context, a *fuse.Attr) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Attr", err, bgTime, 1)
}()
ino := f.info.Inode
info, err := f.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Attr: ino(%v) err(%v)", ino, err)
if err == fuse.ENOENT {
a.Inode = ino
return nil
}
return ParseError(err)
}
fillAttr(info, a)
a.ParentIno = f.parentIno
fileSize, gen := f.fileSizeVersion2(ino)
log.LogDebugf("Attr: ino(%v) fileSize(%v) gen(%v) inode.gen(%v)", ino, fileSize, gen, info.Generation)
if gen >= info.Generation {
a.Size = uint64(fileSize)
}
if proto.IsSymlink(info.Mode) {
a.Size = uint64(len(info.Target))
}
log.LogDebugf("TRACE Attr: inode(%v) attr(%v)", info, a)
return nil
}
// Forget handles the forget request. The inode is evicted from the meta service only if it is on the orphan list.
func (f *File) Forget() {
var err error
bgTime := stat.BeginStat()
ino := f.info.Inode
defer func() {
stat.EndStat("Forget", err, bgTime, 1)
log.LogDebugf("TRACE Forget: ino(%v)", ino)
}()
// TODO: why can't fWriter be closed here?
//log.LogErrorf("TRACE Forget: ino(%v)", ino)
//if f.fWriter != nil {
// f.fWriter.Close()
//}
if DisableMetaCache {
f.super.ic.Delete(ino)
f.super.fslock.Lock()
delete(f.super.nodeCache, ino)
f.super.fslock.Unlock()
if err := f.super.ec.EvictStream(ino); err != nil {
log.LogWarnf("Forget: stream not ready to evict, ino(%v) err(%v)", ino, err)
return
}
}
if !f.super.orphan.Evict(ino) {
return
}
fullPath := f.getParentPath() + f.name
if err := f.super.mw.Evict(ino, fullPath); err != nil {
log.LogWarnf("Forget Evict: ino(%v) err(%v)", ino, err)
}
}
// Open handles the open request.
func (f *File) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenResponse) (handle fs.Handle, err error) {
bgTime := stat.BeginStat()
var needBCache bool
defer func() {
stat.EndStat("Open", err, bgTime, 1)
}()
ino := f.info.Inode
log.LogDebugf("TRACE open ino(%v) info(%v)", ino, f.info)
start := time.Now()
if f.super.bcacheDir != "" && !f.filterFilesSuffix(f.super.bcacheFilterFiles) {
parentPath := f.getParentPath()
if parentPath != "" && !strings.HasSuffix(parentPath, "/") {
parentPath = parentPath + "/"
}
log.LogDebugf("TRACE open ino(%v) parentPath(%v)", ino, parentPath)
if strings.HasPrefix(parentPath, f.super.bcacheDir) {
needBCache = true
}
}
if needBCache {
f.super.ec.OpenStreamWithCache(ino, needBCache)
} else {
f.super.ec.OpenStream(ino)
}
log.LogDebugf("TRACE open ino(%v) f.super.bcacheDir(%v) needBCache(%v)", ino, f.super.bcacheDir, needBCache)
f.super.ec.RefreshExtentsCache(ino)
if f.super.keepCache && resp != nil {
resp.Flags |= fuse.OpenKeepCache
}
if proto.IsCold(f.super.volType) {
log.LogDebugf("TRANCE open ino(%v) info(%v)", ino, f.info)
fileSize, _ := f.fileSizeVersion2(ino)
clientConf := blobstore.ClientConfig{
VolName: f.super.volname,
VolType: f.super.volType,
BlockSize: f.super.EbsBlockSize,
Ino: f.info.Inode,
Bc: f.super.bc,
Mw: f.super.mw,
Ec: f.super.ec,
Ebsc: f.super.ebsc,
EnableBcache: f.super.enableBcache,
WConcurrency: f.super.writeThreads,
ReadConcurrency: f.super.readThreads,
CacheAction: f.super.CacheAction,
FileCache: false,
FileSize: uint64(fileSize),
CacheThreshold: f.super.CacheThreshold,
}
f.fWriter.FreeCache()
switch req.Flags & 0x0f {
case syscall.O_RDONLY:
f.fReader = blobstore.NewReader(clientConf)
f.fWriter = nil
case syscall.O_WRONLY:
f.fWriter = blobstore.NewWriter(clientConf)
f.fReader = nil
case syscall.O_RDWR:
f.fReader = blobstore.NewReader(clientConf)
f.fWriter = blobstore.NewWriter(clientConf)
default:
f.fWriter = blobstore.NewWriter(clientConf)
f.fReader = nil
}
log.LogDebugf("TRACE file open,ino(%v) req.Flags(%v) reader(%v) writer(%v)", ino, req.Flags, f.fReader, f.fWriter)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Open: ino(%v) req(%v) resp(%v) (%v)ns", ino, req, resp, elapsed.Nanoseconds())
return f, nil
}
// Release handles the release request.
func (f *File) Release(ctx context.Context, req *fuse.ReleaseRequest) (err error) {
ino := f.info.Inode
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Release", err, bgTime, 1)
log.LogInfof("action[Release] %v", f.fWriter)
f.fWriter.FreeCache()
if DisableMetaCache {
f.super.ic.Delete(ino)
}
}()
log.LogDebugf("TRACE Release enter: ino(%v) req(%v)", ino, req)
start := time.Now()
//log.LogErrorf("TRACE Release close stream: ino(%v) req(%v)", ino, req)
//if f.fWriter != nil {
// f.fWriter.Close()
//}
err = f.super.ec.CloseStream(ino)
if err != nil {
log.LogErrorf("Release: close writer failed, ino(%v) req(%v) err(%v)", ino, req, err)
return ParseError(err)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Release: ino(%v) req(%v) (%v)ns", ino, req, elapsed.Nanoseconds())
return nil
}
// Read handles the read request.
func (f *File) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Read", err, bgTime, 1)
stat.StatBandWidth("Read", uint32(req.Size))
}()
log.LogDebugf("TRACE Read enter: ino(%v) offset(%v) reqsize(%v) req(%v)", f.info.Inode, req.Offset, req.Size, req)
start := time.Now()
metric := exporter.NewTPCnt("fileread")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: f.super.volname})
}()
var size int
if proto.IsHot(f.super.volType) {
size, err = f.super.ec.Read(f.info.Inode, resp.Data[fuse.OutHeaderSize:], int(req.Offset), req.Size)
} else {
size, err = f.fReader.Read(ctx, resp.Data[fuse.OutHeaderSize:], int(req.Offset), req.Size)
}
if err != nil && err != io.EOF {
msg := fmt.Sprintf("Read: ino(%v) req(%v) err(%v) size(%v)", f.info.Inode, req, err, size)
f.super.handleError("Read", msg)
errMetric := exporter.NewCounter("fileReadFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: f.super.volname, exporter.Err: "EIO"})
return ParseError(err)
}
if size > req.Size {
msg := fmt.Sprintf("Read: read size larger than request size, ino(%v) req(%v) size(%v)", f.info.Inode, req, size)
f.super.handleError("Read", msg)
errMetric := exporter.NewCounter("fileReadFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: f.super.volname, exporter.Err: "ERANGE"})
return fuse.ERANGE
}
if size > 0 {
resp.Data = resp.Data[:size+fuse.OutHeaderSize]
} else if size <= 0 {
resp.Data = resp.Data[:fuse.OutHeaderSize]
log.LogWarnf("Read: ino(%v) offset(%v) reqsize(%v) req(%v) size(%v)", f.info.Inode, req.Offset, req.Size, req, size)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Read: ino(%v) offset(%v) reqsize(%v) req(%v) size(%v) (%v)ns", f.info.Inode, req.Offset, req.Size, req, size, elapsed.Nanoseconds())
return nil
}
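// Buffer layout assumed by Read above (a descriptive note): resp.Data arrives
// with fuse.OutHeaderSize bytes reserved at the front for the FUSE reply
// header, so the payload is read into resp.Data[fuse.OutHeaderSize:] and the
// slice is re-cut afterwards:
//
//	n, _ := read(resp.Data[fuse.OutHeaderSize:]) // read is a stand-in for ec.Read / fReader.Read
//	resp.Data = resp.Data[:fuse.OutHeaderSize+n] // header + n payload bytes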
// Write handles the write request.
func (f *File) Write(ctx context.Context, req *fuse.WriteRequest, resp *fuse.WriteResponse) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Write", err, bgTime, 1)
stat.StatBandWidth("Write", uint32(len(req.Data)))
}()
ino := f.info.Inode
reqlen := len(req.Data)
log.LogDebugf("TRACE Write enter: ino(%v) offset(%v) len(%v) flags(%v) fileflags(%v) quotaIds(%v) req(%v)",
ino, req.Offset, reqlen, req.Flags, req.FileFlags, f.info.QuotaInfos, req)
if proto.IsHot(f.super.volType) {
filesize, _ := f.fileSize(ino)
if req.Offset > int64(filesize) && reqlen == 1 && req.Data[0] == 0 {
// workaround: posix_fallocate would write 1 byte if fallocate is not supported.
fullPath := path.Join(f.getParentPath(), f.name)
err = f.super.ec.Truncate(f.super.mw, f.parentIno, ino, int(req.Offset)+reqlen, fullPath)
if err == nil {
resp.Size = reqlen
}
log.LogDebugf("fallocate: ino(%v) origFilesize(%v) req(%v) err(%v)", f.info.Inode, filesize, req, err)
return
}
}
defer func() {
f.super.ic.Delete(ino)
}()
var waitForFlush bool
var flags int
if isDirectIOEnabled(req.FileFlags) || (req.FileFlags&fuse.OpenSync != 0) {
waitForFlush = true
if f.super.enSyncWrite {
flags |= proto.FlagsSyncWrite
}
if proto.IsCold(f.super.volType) {
waitForFlush = false
flags |= proto.FlagsSyncWrite
}
}
if req.FileFlags&fuse.OpenAppend != 0 || proto.IsCold(f.super.volType) {
flags |= proto.FlagsAppend
}
start := time.Now()
metric := exporter.NewTPCnt("filewrite")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: f.super.volname})
}()
checkFunc := func() error {
if !f.super.mw.EnableQuota {
return nil
}
if ok := f.super.ec.UidIsLimited(req.Uid); ok {
return ParseError(syscall.ENOSPC)
}
var quotaIds []uint32
for quotaId := range f.info.QuotaInfos {
quotaIds = append(quotaIds, quotaId)
}
if limited := f.super.mw.IsQuotaLimited(quotaIds); limited {
return ParseError(syscall.ENOSPC)
}
return nil
}
var size int
if proto.IsHot(f.super.volType) {
f.super.ec.GetStreamer(ino).SetParentInode(f.parentIno)
if size, err = f.super.ec.Write(ino, int(req.Offset), req.Data, flags, checkFunc); err == ParseError(syscall.ENOSPC) {
return
}
} else {
atomic.StoreInt32(&f.idle, 0)
size, err = f.fWriter.Write(ctx, int(req.Offset), req.Data, flags)
}
if err != nil {
msg := fmt.Sprintf("Write: ino(%v) offset(%v) len(%v) err(%v)", ino, req.Offset, reqlen, err)
f.super.handleError("Write", msg)
errMetric := exporter.NewCounter("fileWriteFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: f.super.volname, exporter.Err: "EIO"})
if err == syscall.EOPNOTSUPP {
return fuse.ENOTSUP
}
return fuse.EIO
}
resp.Size = size
if size != reqlen {
log.LogErrorf("Write: ino(%v) offset(%v) len(%v) size(%v)", ino, req.Offset, reqlen, size)
}
// only hot volumes need to wait for the flush
if waitForFlush {
err = f.super.ec.Flush(ino)
if err != nil {
msg := fmt.Sprintf("Write: failed to wait for flush, ino(%v) offset(%v) len(%v) err(%v) req(%v)", ino, req.Offset, reqlen, err, req)
f.super.handleError("Wrtie", msg)
errMetric := exporter.NewCounter("fileWriteFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: f.super.volname, exporter.Err: "EIO"})
return ParseError(err)
}
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Write: ino(%v) offset(%v) len(%v) flags(%v) fileflags(%v) req(%v) (%v)ns ",
ino, req.Offset, reqlen, req.Flags, req.FileFlags, req, elapsed.Nanoseconds())
return nil
}
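// Flag mapping used by Write above, summarized (descriptive only): O_DIRECT or
// O_SYNC makes hot-volume writes wait for an explicit Flush (adding
// FlagsSyncWrite when enSyncWrite is on), while cold volumes never wait but
// always carry FlagsSyncWrite; O_APPEND, and every cold-volume write, adds
// FlagsAppend. The single zero byte written past EOF is the posix_fallocate
// fallback and is turned into a Truncate instead of a data write.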
// Flush handles the flush request; it flushes only when fsyncOnClose is enabled.
func (f *File) Flush(ctx context.Context, req *fuse.FlushRequest) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Flush", err, bgTime, 1)
}()
if !f.super.fsyncOnClose {
return fuse.ENOSYS
}
log.LogDebugf("TRACE Flush enter: ino(%v)", f.info.Inode)
start := time.Now()
metric := exporter.NewTPCnt("filesync")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: f.super.volname})
}()
if proto.IsHot(f.super.volType) {
err = f.super.ec.Flush(f.info.Inode)
} else {
f.Lock()
err = f.fWriter.Flush(f.info.Inode, ctx)
f.Unlock()
}
log.LogDebugf("TRACE Flush: ino(%v) err(%v)", f.info.Inode, err)
if err != nil {
msg := fmt.Sprintf("Flush: ino(%v) err(%v)", f.info.Inode, err)
f.super.handleError("Flush", msg)
log.LogErrorf("TRACE Flush err: ino(%v) err(%v)", f.info.Inode, err)
return ParseError(err)
}
if DisableMetaCache {
f.super.ic.Delete(f.info.Inode)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Flush: ino(%v) (%v)ns", f.info.Inode, elapsed.Nanoseconds())
return nil
}
// Fsync handles the fsync request.
func (f *File) Fsync(ctx context.Context, req *fuse.FsyncRequest) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Fsync", err, bgTime, 1)
}()
log.LogDebugf("TRACE Fsync enter: ino(%v)", f.info.Inode)
start := time.Now()
if proto.IsHot(f.super.volType) {
err = f.super.ec.Flush(f.info.Inode)
} else {
err = f.fWriter.Flush(f.info.Inode, ctx)
}
if err != nil {
msg := fmt.Sprintf("Fsync: ino(%v) err(%v)", f.info.Inode, err)
f.super.handleError("Fsync", msg)
return ParseError(err)
}
f.super.ic.Delete(f.info.Inode)
elapsed := time.Since(start)
log.LogDebugf("TRACE Fsync: ino(%v) (%v)ns", f.info.Inode, elapsed.Nanoseconds())
return nil
}
// Setattr handles the setattr request.
func (f *File) Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *fuse.SetattrResponse) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Setattr", err, bgTime, 1)
}()
ino := f.info.Inode
start := time.Now()
if req.Valid.Size() && proto.IsHot(f.super.volType) {
// When an NFS client opens a file with the trunc flag on a CubeFS mount, the CubeFS client may receive only the setattr
// without the preceding open, so the streamer may not be opened yet and I/O would eventually fail. Open the stream here regardless of whether it is already open.
if err := f.super.ec.OpenStream(ino); err != nil {
log.LogErrorf("Setattr: OpenStream ino(%v) size(%v) err(%v)", ino, req.Size, err)
return ParseError(err)
}
defer f.super.ec.CloseStream(ino)
if err := f.super.ec.Flush(ino); err != nil {
log.LogErrorf("Setattr: truncate wait for flush ino(%v) size(%v) err(%v)", ino, req.Size, err)
return ParseError(err)
}
fullPath := path.Join(f.getParentPath(), f.name)
if err := f.super.ec.Truncate(f.super.mw, f.parentIno, ino, int(req.Size), fullPath); err != nil {
log.LogErrorf("Setattr: truncate ino(%v) size(%v) err(%v)", ino, req.Size, err)
return ParseError(err)
}
f.super.ic.Delete(ino)
f.super.ec.RefreshExtentsCache(ino)
}
info, err := f.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Setattr: InodeGet failed, ino(%v) err(%v)", ino, err)
return ParseError(err)
}
if req.Valid.Size() && proto.IsHot(f.super.volType) {
if req.Size != info.Size {
log.LogWarnf("Setattr: truncate ino(%v) reqSize(%v) inodeSize(%v)", ino, req.Size, info.Size)
}
}
if valid := setattr(info, req); valid != 0 {
err = f.super.mw.Setattr(ino, valid, info.Mode, info.Uid, info.Gid, info.AccessTime.Unix(),
info.ModifyTime.Unix())
if err != nil {
f.super.ic.Delete(ino)
return ParseError(err)
}
}
fillAttr(info, &resp.Attr)
elapsed := time.Since(start)
log.LogDebugf("TRACE Setattr: ino(%v) req(%v) (%v)ns", ino, req, elapsed.Nanoseconds())
return nil
}
// Readlink handles the readlink request.
func (f *File) Readlink(ctx context.Context, req *fuse.ReadlinkRequest) (string, error) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Readlink", err, bgTime, 1)
}()
ino := f.info.Inode
info, err := f.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Readlink: ino(%v) err(%v)", ino, err)
return "", ParseError(err)
}
log.LogDebugf("TRACE Readlink: ino(%v) target(%v)", ino, string(info.Target))
return string(info.Target), nil
}
// Getxattr handles the getxattr request.
func (f *File) Getxattr(ctx context.Context, req *fuse.GetxattrRequest, resp *fuse.GetxattrResponse) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Getxattr", err, bgTime, 1)
}()
if !f.super.enableXattr {
return fuse.ENOSYS
}
ino := f.info.Inode
name := req.Name
size := req.Size
pos := req.Position
info, err := f.super.mw.XAttrGet_ll(ino, name)
if err != nil {
log.LogErrorf("GetXattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
value := info.Get(name)
if pos > 0 {
value = value[pos:]
}
if size > 0 && size < uint32(len(value)) {
value = value[:size]
}
resp.Xattr = value
log.LogDebugf("TRACE GetXattr: ino(%v) name(%v)", ino, name)
return nil
}
// Listxattr handles the listxattr request.
func (f *File) Listxattr(ctx context.Context, req *fuse.ListxattrRequest, resp *fuse.ListxattrResponse) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Listxattr", err, bgTime, 1)
}()
if !f.super.enableXattr {
return fuse.ENOSYS
}
ino := f.info.Inode
_ = req.Size // ignore currently
_ = req.Position // ignore currently
keys, err := f.super.mw.XAttrsList_ll(ino)
if err != nil {
log.LogErrorf("ListXattr: ino(%v) err(%v)", ino, err)
return ParseError(err)
}
for _, key := range keys {
resp.Append(key)
}
log.LogDebugf("TRACE Listxattr: ino(%v)", ino)
return nil
}
// Setxattr handles the setxattr request.
func (f *File) Setxattr(ctx context.Context, req *fuse.SetxattrRequest) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Setxattr", err, bgTime, 1)
}()
if !f.super.enableXattr {
return fuse.ENOSYS
}
ino := f.info.Inode
name := req.Name
value := req.Xattr
// TODO: implement flags to improve compatibility (Mofei Zhang)
if err = f.super.mw.XAttrSet_ll(ino, []byte(name), []byte(value)); err != nil {
log.LogErrorf("Setxattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
log.LogDebugf("TRACE Setxattr: ino(%v) name(%v)", ino, name)
return nil
}
// Removexattr handles the removexattr request.
func (f *File) Removexattr(ctx context.Context, req *fuse.RemovexattrRequest) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Removexattr", err, bgTime, 1)
}()
if !f.super.enableXattr {
return fuse.ENOSYS
}
ino := f.info.Inode
name := req.Name
if err = f.super.mw.XAttrDel_ll(ino, name); err != nil {
log.LogErrorf("Removexattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
log.LogDebugf("TRACE RemoveXattr: ino(%v) name(%v)", ino, name)
return nil
}
func (f *File) fileSize(ino uint64) (size int, gen uint64) {
size, gen, valid := f.super.ec.FileSize(ino)
if !valid {
if info, err := f.super.InodeGet(ino); err == nil {
size = int(info.Size)
gen = info.Generation
}
}
log.LogDebugf("TRANCE fileSize: ino(%v) fileSize(%v) gen(%v) valid(%v)", ino, size, gen, valid)
return
}
func (f *File) fileSizeVersion2(ino uint64) (size int, gen uint64) {
size, gen, valid := f.super.ec.FileSize(ino)
if proto.IsCold(f.super.volType) {
valid = false
}
if !valid {
if info, err := f.super.InodeGet(ino); err == nil {
size = int(info.Size)
if f.fWriter != nil {
cacheSize := f.fWriter.CacheFileSize()
if cacheSize > size {
size = cacheSize
}
}
gen = info.Generation
}
}
log.LogDebugf("TRACE fileSizeVersion2: ino(%v) fileSize(%v) gen(%v) valid(%v)", ino, size, gen, valid)
return
}
// filterFilesSuffix returns true if this file should not be cached in the block cache.
func (f *File) filterFilesSuffix(filterFiles string) bool {
if f.name == "" {
log.LogWarnf("this file inode[%v], name is nil", f.info)
return true
}
if filterFiles == "" {
return false
}
suffixs := strings.Split(filterFiles, ";")
for _, suffix := range suffixs {
// each entry names one file type, e.g. "py" becomes the suffix ".py"
suffix = "." + suffix
if suffix != "." && strings.Contains(f.name, suffix) {
log.LogDebugf("fileName:%s,filter:%s,suffix:%s,suffixs:%v", f.name, filterFiles, suffix, suffixs)
return true
}
}
return false
}
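// Example of the filter format accepted above (illustrative): with
// bcacheFilterFiles set to "tmp;log" the suffix list becomes [".tmp", ".log"],
// so "build.log" and "data.tmp" are excluded from the block cache while
// "notes.txt" is not. Because the match uses strings.Contains, a name merely
// containing ".log" anywhere (e.g. "a.log.1") is filtered as well.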
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package fs
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
"github.com/cubefs/cubefs/proto"
)
type NewFileParam struct {
Pino uint64
Flag uint32
FileName string
Info *proto.InodeInfo
Super *Super
}
type NewDirParam struct {
Pino uint64
FileName string
Info *proto.InodeInfo
Super *Super
}
func FuzzNewFile(data []byte) int {
f := fuzz.NewConsumer(data)
param := NewFileParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
file := NewFile(param.Super, param.Info, param.Flag, param.Pino, param.FileName)
if file == nil {
return 0
}
return 1
}
func FuzzNewDir(data []byte) int {
f := fuzz.NewConsumer(data)
param := NewDirParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
dir := NewDir(param.Super, param.Info, param.Pino, param.FileName)
if dir == nil {
return 0
}
return 1
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"container/list"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
// MinInodeCacheEvictNum is used in the foreground eviction.
// When clearing the inodes from the cache, it stops as soon as 10 inodes have been evicted.
MinInodeCacheEvictNum = 10
// MaxInodeCacheEvictNum is used in the background. At most 200000 inodes can be evicted per pass.
MaxInodeCacheEvictNum = 200000
BgEvictionInterval = 2 * time.Minute
)
// InodeCache defines the structure of the inode cache.
type InodeCache struct {
sync.RWMutex
cache map[uint64]*list.Element
lruList *list.List
expiration time.Duration
maxElements int
}
// NewInodeCache returns a new inode cache.
func NewInodeCache(exp time.Duration, maxElements int) *InodeCache {
ic := &InodeCache{
cache: make(map[uint64]*list.Element),
lruList: list.New(),
expiration: exp,
maxElements: maxElements,
}
go ic.backgroundEviction()
return ic
}
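// Typical usage of the inode cache (a minimal sketch; the expiration and size
// below are arbitrary example values):
//
//	ic := NewInodeCache(30*time.Second, 100000)
//	ic.Put(info) // info is a *proto.InodeInfo
//	if cached := ic.Get(info.Inode); cached != nil {
//		// served from cache until it expires or is evicted
//	}
//	ic.Delete(info.Inode) // drop after a metadata-changing operation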
// Put puts the given inode info into the inode cache.
func (ic *InodeCache) Put(info *proto.InodeInfo) {
ic.Lock()
old, ok := ic.cache[info.Inode]
if ok {
ic.lruList.Remove(old)
delete(ic.cache, info.Inode)
}
if ic.lruList.Len() >= ic.maxElements {
ic.evict(true)
}
inodeSetExpiration(info, ic.expiration)
element := ic.lruList.PushFront(info)
ic.cache[info.Inode] = element
ic.Unlock()
// log.LogDebugf("InodeCache put inode: inode(%v)", info.Inode)
}
// Get returns the inode info based on the given inode number.
func (ic *InodeCache) Get(ino uint64) *proto.InodeInfo {
ic.RLock()
element, ok := ic.cache[ino]
if !ok {
ic.RUnlock()
return nil
}
info := element.Value.(*proto.InodeInfo)
if inodeExpired(info) && DisableMetaCache {
ic.RUnlock()
// log.LogDebugf("InodeCache GetConnect expired: now(%v) inode(%v), expired(%d)", time.Now().Format(LogTimeFormat), info.Inode, info.Expiration())
return nil
}
ic.RUnlock()
return info
}
// Delete deletes the inode info based on the given inode number.
func (ic *InodeCache) Delete(ino uint64) {
// log.LogDebugf("InodeCache Delete: ino(%v)", ino)
ic.Lock()
element, ok := ic.cache[ino]
if ok {
ic.lruList.Remove(element)
delete(ic.cache, ino)
}
ic.Unlock()
}
// Foreground eviction cares more about the speed.
// Background eviction evicts all expired items from the cache.
// The caller should grab the WRITE lock of the inode cache.
func (ic *InodeCache) evict(foreground bool) {
var count int
for i := 0; i < MinInodeCacheEvictNum; i++ {
element := ic.lruList.Back()
if element == nil {
return
}
// For background eviction, return as soon as all expired items have been evicted.
// Foreground eviction must evict at least MinInodeCacheEvictNum inodes,
// regardless of whether they have expired.
info := element.Value.(*proto.InodeInfo)
if !foreground && !inodeExpired(info) {
return
}
// log.LogDebugf("InodeCache GetConnect expired: now(%v) inode(%v)", time.Now().Format(LogTimeFormat), info.Inode)
ic.lruList.Remove(element)
delete(ic.cache, info.Inode)
count++
}
// For background eviction, we need to continue evict all expired items from the cache
if foreground {
return
}
for i := 0; i < MaxInodeCacheEvictNum; i++ {
element := ic.lruList.Back()
if element == nil {
break
}
info := element.Value.(*proto.InodeInfo)
if !inodeExpired(info) {
break
}
// log.LogDebugf("InodeCache GetConnect expired: now(%v) inode(%v)", time.Now().Format(LogTimeFormat), info.Inode)
ic.lruList.Remove(element)
delete(ic.cache, info.Inode)
count++
}
}
func (ic *InodeCache) backgroundEviction() {
t := time.NewTicker(BgEvictionInterval)
defer t.Stop()
for range t.C {
log.LogInfof("InodeCache: start BG evict")
if !DisableMetaCache {
log.LogInfof("InodeCache: no need to do BG evict")
continue
}
start := time.Now()
ic.Lock()
ic.evict(false)
ic.Unlock()
elapsed := time.Since(start)
log.LogInfof("InodeCache: total inode cache(%d), cost(%d)ns", ic.lruList.Len(), elapsed.Nanoseconds())
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"time"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
LogTimeFormat = "20060102150405000"
)
func (s *Super) InodeGet(ino uint64) (*proto.InodeInfo, error) {
info := s.ic.Get(ino)
if info != nil {
return info, nil
}
info, err := s.mw.InodeGet_ll(ino)
if err != nil || info == nil {
log.LogErrorf("InodeGet: ino(%v) err(%v) info(%v)", ino, err, info)
if err != nil {
return nil, ParseError(err)
} else {
return nil, fuse.ENOENT
}
}
s.ic.Put(info)
s.fslock.Lock()
node, isFind := s.nodeCache[ino]
s.fslock.Unlock()
if isFind {
dir, ok := node.(*Dir)
if ok {
dir.info = info
} else {
node.(*File).info = info
}
}
s.ec.RefreshExtentsCache(ino)
return info, nil
}
func setattr(info *proto.InodeInfo, req *fuse.SetattrRequest) (valid uint32) {
if req.Valid.Mode() {
info.Mode = proto.Mode(req.Mode)
valid |= proto.AttrMode
}
if req.Valid.Uid() {
info.Uid = req.Uid
valid |= proto.AttrUid
}
if req.Valid.Gid() {
info.Gid = req.Gid
valid |= proto.AttrGid
}
if req.Valid.Atime() {
info.AccessTime = req.Atime
valid |= proto.AttrAccessTime
}
if req.Valid.Mtime() {
info.ModifyTime = req.Mtime
valid |= proto.AttrModifyTime
}
return
}
func fillAttr(info *proto.InodeInfo, attr *fuse.Attr) {
attr.Valid = AttrValidDuration
attr.Nlink = info.Nlink
attr.Inode = info.Inode
attr.Mode = proto.OsMode(info.Mode)
attr.Size = info.Size
attr.Blocks = attr.Size >> 9 // In 512 bytes
attr.Atime = info.AccessTime
attr.Ctime = info.CreateTime
attr.Mtime = info.ModifyTime
attr.BlockSize = DefaultBlksize
attr.Uid = info.Uid
attr.Gid = info.Gid
}
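// Worked example for the Blocks computation in fillAttr above: attr.Blocks
// counts 512-byte units, so a 4096-byte file reports Blocks = 4096 >> 9 = 8,
// while attr.Size stays in bytes and attr.BlockSize advertises the preferred
// I/O size (DefaultBlksize).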
func inodeExpired(info *proto.InodeInfo) bool {
return time.Now().UnixNano() > info.Expiration()
}
func inodeSetExpiration(info *proto.InodeInfo, t time.Duration) {
info.SetExpiration(time.Now().Add(t).UnixNano())
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"container/list"
"sync"
)
// OrphanInodeList defines the orphan inode list, which is a list of orphan inodes.
// An orphan inode is the inode whose nlink value is 0.
type OrphanInodeList struct {
sync.RWMutex
cache map[uint64]*list.Element
list *list.List
}
// NewOrphanInodeList returns a new orphan inode list.
func NewOrphanInodeList() *OrphanInodeList {
return &OrphanInodeList{
cache: make(map[uint64]*list.Element),
list: list.New(),
}
}
// Put puts an inode into the orphan inode list.
func (l *OrphanInodeList) Put(ino uint64) {
l.Lock()
defer l.Unlock()
_, ok := l.cache[ino]
if !ok {
element := l.list.PushFront(ino)
l.cache[ino] = element
}
}
// Evict removes the given inode from the orphan inode list and reports whether it was present.
func (l *OrphanInodeList) Evict(ino uint64) bool {
l.Lock()
defer l.Unlock()
element, ok := l.cache[ino]
if !ok {
return false
}
l.list.Remove(element)
delete(l.cache, ino)
return true
}
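// How the orphan list fits together (a descriptive note based on this file and
// File.Forget above): an inode is Put here once its nlink drops to zero while
// the kernel still references it, and Forget later calls Evict; only when
// Evict returns true does the client ask the meta service to evict the inode,
// so the final eviction happens at most once.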
package fs
import (
"container/list"
"sync"
"time"
"github.com/cubefs/cubefs/sdk/meta"
)
const (
MinSummaryCacheEvictNum = 10
MaxSummaryCacheEvictNum = 200000
SummaryBgEvictionInterval = 2 * time.Minute
DefaultSummaryExpiration = 2 * time.Minute
MaxSummaryCache = 1000000
)
// SummaryCache defines the structure of the content-summary cache.
type SummaryCache struct {
sync.RWMutex
cache map[uint64]*list.Element
lruList *list.List
expiration time.Duration
maxElements int
}
// summaryCacheElement defines the structure of the content-summary cache's element.
type summaryCacheElement struct {
ino uint64
info *meta.SummaryInfo
expiration int64
}
// NewSummaryCache returns a new content-summary cache.
func NewSummaryCache(exp time.Duration, maxElement int) *SummaryCache {
sc := &SummaryCache{
cache: make(map[uint64]*list.Element),
lruList: list.New(),
expiration: exp,
maxElements: maxElement,
}
go sc.backgroundEviction()
return sc
}
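// Typical usage of the summary cache (a minimal sketch; the SummaryInfo fields
// shown are the ones read elsewhere in this package):
//
//	sc := NewSummaryCache(DefaultSummaryExpiration, MaxSummaryCache)
//	sc.Put(ino, &meta.SummaryInfo{Files: 3, Subdirs: 1, Fbytes: 4096})
//	if info := sc.Get(ino); info != nil {
//		// valid for up to DefaultSummaryExpiration
//	}
//	sc.Delete(ino)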
// Put puts the given summary info into the content-summary cache.
func (sc *SummaryCache) Put(inode uint64, summaryInfo *meta.SummaryInfo) {
sc.Lock()
old, ok := sc.cache[inode]
if ok {
sc.lruList.Remove(old)
delete(sc.cache, inode)
}
if sc.lruList.Len() >= sc.maxElements {
sc.evict(true)
}
element := sc.lruList.PushFront(&summaryCacheElement{
ino: inode,
info: summaryInfo,
expiration: time.Now().Add(sc.expiration).UnixNano(),
})
sc.cache[inode] = element
sc.Unlock()
}
// Get returns the content-summary info based on the given inode number.
func (sc *SummaryCache) Get(inode uint64) *meta.SummaryInfo {
sc.RLock()
element, ok := sc.cache[inode]
if !ok {
sc.RUnlock()
return nil
}
info := element.Value.(*summaryCacheElement)
if cacheExpired(info) {
sc.RUnlock()
return nil
}
sc.RUnlock()
return info.info
}
// Delete deletes the content-summary info based on the given inode number.
func (sc *SummaryCache) Delete(inode uint64) {
sc.Lock()
element, ok := sc.cache[inode]
if ok {
sc.lruList.Remove(element)
delete(sc.cache, inode)
}
sc.Unlock()
}
func (sc *SummaryCache) evict(foreground bool) {
for i := 0; i < MinSummaryCacheEvictNum; i++ {
element := sc.lruList.Back()
if element == nil {
return
}
info := element.Value.(*summaryCacheElement)
if !foreground && !cacheExpired(info) {
return
}
sc.lruList.Remove(element)
delete(sc.cache, info.ino)
}
if foreground {
return
}
for i := 0; i < MaxSummaryCacheEvictNum; i++ {
element := sc.lruList.Back()
if element == nil {
break
}
info := element.Value.(*summaryCacheElement)
if !cacheExpired(info) {
break
}
sc.lruList.Remove(element)
delete(sc.cache, info.ino)
}
}
func (sc *SummaryCache) backgroundEviction() {
t := time.NewTicker(SummaryBgEvictionInterval)
defer t.Stop()
for range t.C {
sc.Lock()
sc.evict(false)
sc.Unlock()
}
}
func cacheExpired(info *summaryCacheElement) bool {
return time.Now().UnixNano() > info.expiration
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"context"
"fmt"
"net/http"
"os"
"path"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/blobstore/api/access"
"github.com/cubefs/cubefs/blockcache/bcache"
"github.com/cubefs/cubefs/client/common"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/depends/bazil.org/fuse/fs"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/blobstore"
"github.com/cubefs/cubefs/sdk/data/stream"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/ump"
)
// Super defines the struct of a super block.
type Super struct {
cluster string
volname string
masters string
mountPoint string
subDir string
owner string
ic *InodeCache
dc *Dcache
mw *meta.MetaWrapper
ec *stream.ExtentClient
orphan *OrphanInodeList
enSyncWrite bool
keepCache bool
nodeCache map[uint64]fs.Node
fslock sync.Mutex
disableDcache bool
fsyncOnClose bool
enableXattr bool
rootIno uint64
state fs.FSStatType
sockaddr string
suspendCh chan interface{}
// data lake
volType int
ebsEndpoint string
CacheAction int
CacheThreshold int
EbsBlockSize int
enableBcache bool
bcacheDir string
bcacheFilterFiles string
bcacheCheckInterval int64
bcacheBatchCnt int64
readThreads int
writeThreads int
bc *bcache.BcacheClient
ebsc *blobstore.BlobStoreClient
sc *SummaryCache
taskPool []common.TaskPool
closeC chan struct{}
enableVerRead bool
}
// Interfaces that Super must implement
var (
_ fs.FS = (*Super)(nil)
_ fs.FSStatfser = (*Super)(nil)
)
const (
BlobWriterIdleTimeoutPeriod = 10
DefaultTaskPoolSize = 30
)
// NewSuper returns a new Super.
func NewSuper(opt *proto.MountOptions) (s *Super, err error) {
s = new(Super)
masters := strings.Split(opt.Master, meta.HostsSeparator)
metaConfig := &meta.MetaConfig{
Volume: opt.Volname,
Owner: opt.Owner,
Masters: masters,
Authenticate: opt.Authenticate,
TicketMess: opt.TicketMess,
ValidateOwner: opt.Authenticate || opt.AccessKey == "",
EnableSummary: opt.EnableSummary && opt.EnableXattr,
MetaSendTimeout: opt.MetaSendTimeout,
}
s.mw, err = meta.NewMetaWrapper(metaConfig)
if err != nil {
return nil, errors.Trace(err, "NewMetaWrapper failed!"+err.Error())
}
s.SetTransaction(opt.EnableTransaction, opt.TxTimeout, opt.TxConflictRetryNum, opt.TxConflictRetryInterval)
s.mw.EnableQuota = opt.EnableQuota
s.volname = opt.Volname
s.masters = opt.Master
s.mountPoint = opt.MountPoint
s.subDir = opt.SubDir
s.owner = opt.Owner
s.cluster = s.mw.Cluster()
inodeExpiration := DefaultInodeExpiration
if opt.IcacheTimeout >= 0 {
inodeExpiration = time.Duration(opt.IcacheTimeout) * time.Second
}
if opt.LookupValid >= 0 {
LookupValidDuration = time.Duration(opt.LookupValid) * time.Second
}
if opt.AttrValid >= 0 {
AttrValidDuration = time.Duration(opt.AttrValid) * time.Second
}
if opt.EnSyncWrite > 0 {
s.enSyncWrite = true
}
s.keepCache = opt.KeepCache
if opt.MaxStreamerLimit > 0 {
s.ic = NewInodeCache(inodeExpiration, MaxInodeCache)
s.dc = NewDcache(inodeExpiration, MaxInodeCache)
} else {
s.ic = NewInodeCache(inodeExpiration, DefaultMaxInodeCache)
s.dc = NewDcache(inodeExpiration, DefaultMaxInodeCache)
}
s.orphan = NewOrphanInodeList()
s.nodeCache = make(map[uint64]fs.Node)
s.disableDcache = opt.DisableDcache
s.fsyncOnClose = opt.FsyncOnClose
s.enableXattr = opt.EnableXattr
s.bcacheCheckInterval = opt.BcacheCheckIntervalS
s.bcacheFilterFiles = opt.BcacheFilterFiles
s.bcacheBatchCnt = opt.BcacheBatchCnt
s.closeC = make(chan struct{}, 1)
s.taskPool = []common.TaskPool{common.New(DefaultTaskPoolSize, DefaultTaskPoolSize), common.New(DefaultTaskPoolSize, DefaultTaskPoolSize)}
if s.mw.EnableSummary {
s.sc = NewSummaryCache(DefaultSummaryExpiration, MaxSummaryCache)
}
if opt.MaxStreamerLimit > 0 {
DisableMetaCache = false
s.fsyncOnClose = false
}
if !strings.HasSuffix(opt.MountPoint, "/") {
opt.MountPoint = opt.MountPoint + "/"
}
if !strings.HasSuffix(opt.SubDir, "/") {
opt.SubDir = opt.SubDir + "/"
}
if opt.BcacheDir != "" && !strings.HasSuffix(opt.BcacheDir, "/") {
opt.BcacheDir = opt.BcacheDir + "/"
}
// when the block cache is enabled, default the bcache dir to the mount point
if opt.EnableBcache && opt.BcacheDir == "" {
s.bcacheDir = opt.MountPoint
}
if s.bcacheDir == opt.MountPoint {
s.bcacheDir = "/"
} else {
s.bcacheDir = strings.ReplaceAll(opt.BcacheDir, opt.MountPoint, "/")
if s.bcacheDir != "" && !strings.HasSuffix(s.bcacheDir, "/") {
s.bcacheDir = s.bcacheDir + "/"
}
}
s.volType = opt.VolType
s.ebsEndpoint = opt.EbsEndpoint
s.CacheAction = opt.CacheAction
s.CacheThreshold = opt.CacheThreshold
s.EbsBlockSize = opt.EbsBlockSize
s.enableBcache = opt.EnableBcache
s.readThreads = int(opt.ReadThreads)
s.writeThreads = int(opt.WriteThreads)
if s.enableBcache {
s.bc = bcache.NewBcacheClient()
}
extentConfig := &stream.ExtentConfig{
Volume: opt.Volname,
Masters: masters,
FollowerRead: opt.FollowerRead,
NearRead: opt.NearRead,
ReadRate: opt.ReadRate,
WriteRate: opt.WriteRate,
VolumeType: opt.VolType,
BcacheEnable: opt.EnableBcache,
BcacheDir: opt.BcacheDir,
MaxStreamerLimit: opt.MaxStreamerLimit,
VerReadSeq: opt.VerReadSeq,
OnAppendExtentKey: s.mw.AppendExtentKey,
OnSplitExtentKey: s.mw.SplitExtentKey,
OnGetExtents: s.mw.GetExtents,
OnTruncate: s.mw.Truncate,
OnEvictIcache: s.ic.Delete,
OnLoadBcache: s.bc.Get,
OnCacheBcache: s.bc.Put,
OnEvictBcache: s.bc.Evict,
DisableMetaCache: DisableMetaCache,
MinWriteAbleDataPartitionCnt: opt.MinWriteAbleDataPartitionCnt,
}
s.ec, err = stream.NewExtentClient(extentConfig)
if err != nil {
return nil, errors.Trace(err, "NewExtentClient failed!")
}
s.mw.VerReadSeq = s.ec.GetReadVer()
if proto.IsCold(opt.VolType) {
s.ebsc, err = blobstore.NewEbsClient(access.Config{
ConnMode: access.NoLimitConnMode,
Consul: access.ConsulConfig{
Address: opt.EbsEndpoint,
},
MaxSizePutOnce: MaxSizePutOnce,
Logger: &access.Logger{
Filename: path.Join(opt.Logpath, "client/ebs.log"),
},
})
if err != nil {
return nil, errors.Trace(err, "NewEbsClient failed!")
}
}
s.mw.Client = s.ec
if !opt.EnablePosixACL {
opt.EnablePosixACL = s.ec.GetEnablePosixAcl()
}
if s.rootIno, err = s.mw.GetRootIno(opt.SubDir); err != nil {
return nil, err
}
s.suspendCh = make(chan interface{})
if proto.IsCold(opt.VolType) {
go s.scheduleFlush()
}
if s.mw.EnableSummary {
s.sc = NewSummaryCache(DefaultSummaryExpiration, MaxSummaryCache)
}
if opt.NeedRestoreFuse {
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatRestore))
}
log.LogInfof("NewSuper: cluster(%v) volname(%v) icacheExpiration(%v) LookupValidDuration(%v) AttrValidDuration(%v) state(%v)",
s.cluster, s.volname, inodeExpiration, LookupValidDuration, AttrValidDuration, s.state)
go s.loopSyncMeta()
return s, nil
}
func (s *Super) scheduleFlush() {
t := time.NewTicker(2 * time.Second)
defer t.Stop()
for {
select {
case <-t.C:
ctx := context.Background()
s.fslock.Lock()
for ino, node := range s.nodeCache {
if _, ok := node.(*File); !ok {
continue
}
file := node.(*File)
if atomic.LoadInt32(&file.idle) >= BlobWriterIdleTimeoutPeriod {
if file.fWriter != nil {
atomic.StoreInt32(&file.idle, 0)
go file.fWriter.Flush(ino, ctx)
}
} else {
atomic.AddInt32(&file.idle, 1)
}
}
s.fslock.Unlock()
}
}
}
// Root returns the root directory node of the mounted volume.
func (s *Super) Root() (fs.Node, error) {
inode, err := s.InodeGet(s.rootIno)
if err != nil {
return nil, err
}
root := NewDir(s, inode, inode.Inode, "")
return root, nil
}
func (s *Super) Node(ino, pino uint64, mode uint32) (fs.Node, error) {
var node fs.Node
// Create a fake InodeInfo. All File or Dir operations only use
// InodeInfo.Inode.
fakeInfo := &proto.InodeInfo{Inode: ino, Mode: mode}
if proto.OsMode(fakeInfo.Mode).IsDir() {
node = NewDir(s, fakeInfo, pino, "")
} else {
node = NewFile(s, fakeInfo, DefaultFlag, pino, "")
// The node is saved in the FuseContextNodes list, which means
// it will not be evicted. So we create a streamer for it,
// leaving the streamer's refcnt at 0.
file := node.(*File)
file.Open(nil, nil, nil)
file.Release(nil, nil)
}
s.fslock.Lock()
s.nodeCache[ino] = node
s.fslock.Unlock()
return node, nil
}
// Statfs handles the Statfs request and returns a set of statistics.
func (s *Super) Statfs(ctx context.Context, req *fuse.StatfsRequest, resp *fuse.StatfsResponse) error {
const defaultMaxMetaPartitionInodeID uint64 = 1<<63 - 1
total, used, inodeCount := s.mw.Statfs()
resp.Blocks = total / uint64(DefaultBlksize)
resp.Bfree = (total - used) / uint64(DefaultBlksize)
resp.Bavail = resp.Bfree
resp.Bsize = DefaultBlksize
resp.Namelen = DefaultMaxNameLen
resp.Frsize = DefaultBlksize
resp.Files = inodeCount
resp.Ffree = defaultMaxMetaPartitionInodeID - inodeCount
return nil
}
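// Worked example for the Statfs arithmetic above (illustrative; it assumes
// DefaultBlksize is 4096): with total = 1 TiB and used = 256 GiB the reply
// reports Blocks = 2^40/4096 = 268435456, Bfree = Bavail = 768 GiB/4096 =
// 201326592, and Ffree = (2^63 - 1) - inodeCount.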
// ClusterName returns the cluster name.
func (s *Super) ClusterName() string {
return s.cluster
}
func (s *Super) GetRate(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(s.ec.GetRate()))
}
func (s *Super) SetRate(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
w.Write([]byte(err.Error()))
return
}
if rate := r.FormValue("read"); rate != "" {
val, err := strconv.Atoi(rate)
if err != nil {
w.Write([]byte("Set read rate failed\n"))
} else {
msg := s.ec.SetReadRate(val)
w.Write([]byte(fmt.Sprintf("Set read rate to %v successfully\n", msg)))
}
}
if rate := r.FormValue("write"); rate != "" {
val, err := strconv.Atoi(rate)
if err != nil {
w.Write([]byte("Set write rate failed\n"))
} else {
msg := s.ec.SetWriteRate(val)
w.Write([]byte(fmt.Sprintf("Set write rate to %v successfully\n", msg)))
}
}
}
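// Example request against SetRate (illustrative; the URL is hypothetical and
// depends on where this handler is registered):
//
//	resp, err := http.PostForm("http://127.0.0.1:17410/rate",
//		url.Values{"read": {"100"}, "write": {"50"}})
//
// The handler reads the "read" and "write" form values and applies them via
// ec.SetReadRate / ec.SetWriteRate.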
func (s *Super) exporterKey(act string) string {
return fmt.Sprintf("%v_fuseclient_%v", s.cluster, act)
}
func (s *Super) umpKey(act string) string {
return fmt.Sprintf("%v_fuseclient_%v", s.cluster, act)
}
func (s *Super) handleError(op, msg string) {
log.LogError(msg)
ump.Alarm(s.umpKey(op), msg)
}
func replyFail(w http.ResponseWriter, r *http.Request, msg string) {
w.WriteHeader(http.StatusBadRequest)
w.Write([]byte(msg))
}
func replySucc(w http.ResponseWriter, r *http.Request, msg string) {
w.WriteHeader(http.StatusOK)
w.Write([]byte(msg))
}
func (s *Super) SetSockAddr(addr string) {
s.sockaddr = addr
}
func (s *Super) SetSuspend(w http.ResponseWriter, r *http.Request) {
var (
err error
ret string
)
if err = r.ParseForm(); err != nil {
replyFail(w, r, err.Error())
return
}
sockaddr := r.FormValue("sock")
if sockaddr == "" {
err = fmt.Errorf("NeedAfterAlloc parameter 'sock' for IPC")
replyFail(w, r, err.Error())
return
}
s.fslock.Lock()
if s.sockaddr != "" ||
!atomic.CompareAndSwapUint32((*uint32)(&s.state), uint32(fs.FSStatResume), uint32(fs.FSStatSuspend)) {
s.fslock.Unlock()
err = fmt.Errorf("Already in suspend: sock '%s', state %v", s.sockaddr, s.state)
replyFail(w, r, err.Error())
return
}
s.sockaddr = sockaddr
s.fslock.Unlock()
// wait
msg := <-s.suspendCh
switch msg.(type) {
case error:
err = msg.(error)
case string:
ret = msg.(string)
default:
err = fmt.Errorf("Unknown return type: %v", msg)
}
if err != nil {
s.fslock.Lock()
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatResume))
s.sockaddr = ""
s.fslock.Unlock()
replyFail(w, r, err.Error())
return
}
if !atomic.CompareAndSwapUint32((*uint32)(&s.state), uint32(fs.FSStatSuspend), uint32(fs.FSStatShutdown)) {
s.fslock.Lock()
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatResume))
s.sockaddr = ""
s.fslock.Unlock()
err = fmt.Errorf("Invalid old state %v", s.state)
replyFail(w, r, err.Error())
return
}
replySucc(w, r, fmt.Sprintf("set suspend successfully: %s", ret))
}
func (s *Super) SetResume(w http.ResponseWriter, r *http.Request) {
s.fslock.Lock()
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatResume))
s.sockaddr = ""
s.fslock.Unlock()
replySucc(w, r, "set resume successfully")
}
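// Illustrative sequence for the suspend/resume handlers above. The /suspend and
// /resume routes and the port are assumptions for this example; the form key
// "sock" comes from the code. A controller first suspends the client, handing
// over a unix socket for IPC, and resumes it if the takeover fails:
//
//	curl "http://127.0.0.1:<profPort>/suspend?sock=/var/run/client.sock"
//	curl "http://127.0.0.1:<profPort>/resume"
//
// SetSuspend blocks on s.suspendCh until Notify delivers a result, so the first
// request returns only after the state has moved to FSStatShutdown (success) or
// back to FSStatResume (failure).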
func (s *Super) EnableAuditLog(w http.ResponseWriter, r *http.Request) {
var err error
if err = r.ParseForm(); err != nil {
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
logPath := r.FormValue("path")
if logPath == "" {
err = fmt.Errorf("path cannot be empty")
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
prefix := r.FormValue("prefix")
if prefix == "" {
err = fmt.Errorf("prefix cannot be empty")
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
logSize := 0
if logSizeStr := r.FormValue("logsize"); logSizeStr != "" {
val, err := strconv.Atoi(logSizeStr)
if err != nil {
err = fmt.Errorf("logSize error")
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
logSize = val
} else {
logSize = auditlog.DefaultAuditLogSize
}
dir, logModule, logMaxSize, err := auditlog.GetAuditLogInfo()
if err != nil {
_, err = auditlog.InitAuditWithPrefix(logPath, prefix, int64(auditlog.DefaultAuditLogSize),
auditlog.NewAuditPrefix(s.masters, s.volname, s.subDir, s.mountPoint))
if err != nil {
err = errors.NewErrorf("Init audit log fail: %v\n", err)
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
info := fmt.Sprintf("audit log is initialized with params: logDir(%v) logModule(%v) logMaxSize(%v)",
logPath, prefix, logSize)
auditlog.BuildSuccessResp(w, info)
} else {
info := fmt.Sprintf("audit log is already initialized with params: logDir(%v) logModule(%v) logMaxSize(%v)",
dir, logModule, logMaxSize)
auditlog.BuildSuccessResp(w, info)
}
}
func (s *Super) State() (state fs.FSStatType, sockaddr string) {
return fs.FSStatType(atomic.LoadUint32((*uint32)(&s.state))), s.sockaddr
}
func (s *Super) Notify(stat fs.FSStatType, msg interface{}) {
if stat == fs.FSStatSuspend {
s.suspendCh <- msg
} else if stat == fs.FSStatRestore {
s.fslock.Lock()
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatResume))
s.sockaddr = ""
s.fslock.Unlock()
}
}
func (s *Super) loopSyncMeta() {
if s.bcacheDir == "" {
return
}
for {
finishC := s.syncMeta()
select {
case <-finishC:
time.Sleep(time.Second * time.Duration(s.bcacheCheckInterval))
case <-s.closeC:
return
}
}
}
func (s *Super) syncMeta() <-chan struct{} {
finishC := make(chan struct{})
start := time.Now()
cacheLen := s.ic.lruList.Len()
allInodes := func() <-chan uint64 {
out := make(chan uint64)
go func() {
for i := s.ic.lruList.Front(); i != nil; i = i.Next() {
oldInfo := i.Value.(*proto.InodeInfo)
out <- oldInfo.Inode
}
close(out)
}()
return out
}()
getChanged := func(in <-chan uint64, batchCnt int64) <-chan uint64 {
out := make(chan uint64)
changed := make([]uint64, 0)
s.taskPool[0].Run(func() {
tmpInodes := make([]uint64, 0, batchCnt)
for i := range in {
tmpInodes = append(tmpInodes, i)
if len(tmpInodes) == int(batchCnt) {
changed = append(changed, s.getModifyInodes(tmpInodes)...)
tmpInodes = tmpInodes[:0]
}
}
if len(tmpInodes) != 0 {
changed = append(changed, s.getModifyInodes(tmpInodes)...)
}
for i := range changed {
out <- changed[i]
}
close(out)
})
return out
}
batCh := make([]<-chan uint64, DefaultTaskPoolSize/3)
for i := range batCh {
batCh[i] = getChanged(allInodes, s.bcacheBatchCnt)
}
mergeChanged := func(cs []<-chan uint64) <-chan uint64 {
var wg sync.WaitGroup
out := make(chan uint64)
wg.Add(len(cs))
for _, c := range cs {
go func(c <-chan uint64) {
for n := range c {
out <- n
}
wg.Done()
}(c)
}
go func() {
wg.Wait()
close(out)
}()
return out
}
var changeCnt int
for ino := range mergeChanged(batCh) {
inode := ino
changeCnt++
log.LogDebugf("sync meta,inode:%d changed", inode)
s.ic.Delete(inode)
s.taskPool[1].Run(func() {
common.Timed(3, 100).On(func() error {
extents := s.ec.GetExtents(inode)
if err := s.ec.ForceRefreshExtentsCache(inode); err != nil {
if err != os.ErrNotExist {
log.LogErrorf("ForceRefreshExtentsCache failed:%v", err)
}
}
log.LogDebugf("inode:%d,extents is :%v", inode, extents)
for _, extent := range extents {
cacheKey := util.GenerateRepVolKey(s.volname, inode, extent.PartitionId, extent.ExtentId, extent.FileOffset)
// retry so that the eviction is more likely to succeed
if s.bc != nil {
common.Timed(3, 100).On(func() error {
return s.bc.Evict(cacheKey)
})
}
}
return nil
})
})
}
log.LogDebugf("total cache cnt:%d,changedCnt:%d,sync meta cost:%v", cacheLen, changeCnt, time.Since(start))
close(finishC)
return finishC
}
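// Minimal sketch of the fan-out/fan-in pattern used by syncMeta above: a single
// producer feeds inode IDs into a channel, several workers consume them and emit
// the "changed" ones, and a merge goroutine closes the output once every worker
// is done. The helper below is illustrative only and not part of this package.
//
//	func mergeUint64(cs ...<-chan uint64) <-chan uint64 {
//		out := make(chan uint64)
//		var wg sync.WaitGroup
//		wg.Add(len(cs))
//		for _, c := range cs {
//			go func(c <-chan uint64) {
//				defer wg.Done()
//				for v := range c {
//					out <- v
//				}
//			}(c)
//		}
//		go func() { wg.Wait(); close(out) }()
//		return out
//	}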
func (s *Super) getModifyInodes(inodes []uint64) (changedNodes []uint64) {
inodeInfos := s.mw.BatchInodeGet(inodes)
// get deleted files
if len(inodeInfos) != len(inodes) {
changedNodes = append(changedNodes, getDelInodes(inodes, inodeInfos)...)
log.LogDebugf("len inodes is %d, len get inode infos is :%d, del inodes is:%v", len(inodes), len(inodeInfos), changedNodes)
}
for _, newInfo := range inodeInfos {
oldInfo := s.ic.Get(newInfo.Inode)
if oldInfo == nil {
continue
}
if !oldInfo.ModifyTime.Equal(newInfo.ModifyTime) || newInfo.Generation != s.ec.GetExtentCacheGen(newInfo.Inode) {
log.LogDebugf("oldInfo:ino(%d) modifyTime(%v) gen(%d),newInfo:ino(%d) modifyTime(%d) gen(%d)", oldInfo.Inode, oldInfo.ModifyTime.Unix(), s.ec.GetExtentCacheGen(newInfo.Inode), newInfo.Inode, newInfo.ModifyTime.Unix(), newInfo.Generation)
changedNodes = append(changedNodes, newInfo.Inode)
} else {
log.LogDebugf("oldInfo:ino(%d) modifyTime(%v) gen(%d),newInfo:ino(%d) modifyTime(%d) gen(%d)", oldInfo.Inode, oldInfo.ModifyTime.Unix(), s.ec.GetExtentCacheGen(newInfo.Inode), newInfo.Inode, newInfo.ModifyTime.Unix(), newInfo.Generation)
}
}
return
}
func getDelInodes(src []uint64, act []*proto.InodeInfo) []uint64 {
delInodes := make([]uint64, 0)
m := make(map[uint64]struct{})
for _, iInfo := range act {
m[iInfo.Inode] = struct{}{}
}
for _, inode := range src {
if _, ok := m[inode]; !ok {
delInodes = append(delInodes, inode)
}
}
return delInodes
}
func (s *Super) Close() {
close(s.closeC)
}
func (s *Super) SetTransaction(txMaskStr string, timeout int64, retryNum int64, retryInterval int64) {
// maskStr := proto.GetMaskString(txMask)
mask, err := proto.GetMaskFromString(txMaskStr)
if err != nil {
log.LogErrorf("SetTransaction: err[%v], op[%v], timeout[%v]", err, txMaskStr, timeout)
return
}
s.mw.EnableTransaction = mask
if timeout <= 0 {
timeout = proto.DefaultTransactionTimeout
}
s.mw.TxTimeout = timeout
if retryNum <= 0 {
retryNum = proto.DefaultTxConflictRetryNum
}
s.mw.TxConflictRetryNum = retryNum
if retryInterval <= 0 {
retryInterval = proto.DefaultTxConflictRetryInterval
}
s.mw.TxConflictRetryInterval = retryInterval
log.LogDebugf("SetTransaction: mask[%v], op[%v], timeout[%v], retryNum[%v], retryInterval[%v ms]",
mask, txMaskStr, timeout, retryNum, retryInterval)
}
package common
import (
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/util/config"
)
const (
StateStandby uint32 = iota
StateStart
StateRunning
StateShutdown
StateStopped
)
type Control struct {
state uint32
wg sync.WaitGroup
}
type Server interface {
Start(cfg *config.Config) error
Shutdown()
// Sync blocks the calling goroutine until the server has shut down.
Sync()
}
type (
DoStartFunc func(s Server, cfg *config.Config) (err error)
DoShutdownFunc func(s Server)
)
func (c *Control) Start(s Server, cfg *config.Config, do DoStartFunc) (err error) {
if atomic.CompareAndSwapUint32(&c.state, StateStandby, StateStart) {
defer func() {
var newState uint32
if err != nil {
newState = StateStandby
} else {
newState = StateRunning
}
atomic.StoreUint32(&c.state, newState)
}()
if err = do(s, cfg); err != nil {
return
}
c.wg.Add(1)
}
return
}
func (c *Control) Shutdown(s Server, do DoShutdownFunc) {
if atomic.CompareAndSwapUint32(&c.state, StateRunning, StateShutdown) {
do(s)
c.wg.Done()
atomic.StoreUint32(&c.state, StateStopped)
}
}
func (c *Control) Sync() {
if atomic.LoadUint32(&c.state) == StateRunning {
c.wg.Wait()
}
}
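// Illustrative usage of Control. The sampleNode type and the callback bodies
// below are assumptions for this example, not part of this package:
//
//	type sampleNode struct{ ctl Control }
//
//	func (n *sampleNode) Start(cfg *config.Config) error {
//		return n.ctl.Start(n, cfg, func(s Server, cfg *config.Config) error {
//			// bring up listeners, background loops, etc.
//			return nil
//		})
//	}
//
//	func (n *sampleNode) Shutdown() {
//		n.ctl.Shutdown(n, func(s Server) {
//			// release resources
//		})
//	}
//
//	func (n *sampleNode) Sync() { n.ctl.Sync() }
//
// Start moves the state Standby -> Start -> Running (or back to Standby on
// error), Shutdown moves Running -> Shutdown -> Stopped, and Sync blocks until
// Shutdown has completed.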
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"hash/crc32"
"math"
"net"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// DataPartitionRepairTask defines the repair task for the data partition.
type DataPartitionRepairTask struct {
TaskType uint8
addr string
extents map[uint64]*storage.ExtentInfo
ExtentsToBeCreated []*storage.ExtentInfo
ExtentsToBeRepaired []*storage.ExtentInfo
LeaderTinyDeleteRecordFileSize int64
LeaderAddr string
}
func NewDataPartitionRepairTask(extentFiles []*storage.ExtentInfo, tinyDeleteRecordFileSize int64, source, leaderAddr string) (task *DataPartitionRepairTask) {
task = &DataPartitionRepairTask{
extents: make(map[uint64]*storage.ExtentInfo),
ExtentsToBeCreated: make([]*storage.ExtentInfo, 0),
ExtentsToBeRepaired: make([]*storage.ExtentInfo, 0),
LeaderTinyDeleteRecordFileSize: tinyDeleteRecordFileSize,
LeaderAddr: leaderAddr,
}
for _, extentFile := range extentFiles {
extentFile.Source = source
task.extents[extentFile.FileID] = extentFile
}
return
}
// Main function to perform the repair.
// The repair process can be described as follows:
// There are two types of repairs: the normal extent repair and the tiny extent repair.
// 1. normal extent repair:
// - the leader collects all the extent information from the followers.
// - for each extent, we compare all the replicas to find the one with the largest size.
// - periodically check the size of the local extent, and if it is smaller than the largest size,
// add it to the toBeRepaired list, and generate the corresponding tasks.
// 2. tiny extent repair:
// - when creating a new partition, add all tiny extents to the toBeRepaired list,
// and the repair task will create all the tiny extents first.
// - the leader of the replicas periodically collects the extent information of each follower.
// - for each extent, we compare all the replicas to find the one with the largest size.
// - periodically check the size of the local extent, and if it is smaller than the largest size,
// add it to the toBeRepaired list, and generate the corresponding tasks.
// A minimal sketch of the size comparison is given after this function.
func (dp *DataPartition) repair(extentType uint8) {
start := time.Now().UnixNano()
log.LogInfof("action[repair] partition(%v) start.", dp.partitionID)
var tinyExtents []uint64 // unavailable extents
if proto.IsTinyExtentType(extentType) {
tinyExtents = dp.brokenTinyExtents()
if len(tinyExtents) == 0 {
return
}
}
// Use a copy of the replicas to avoid an index panic if dp's replica list changes concurrently.
replica := dp.getReplicaCopy()
repairTasks := make([]*DataPartitionRepairTask, len(replica))
err := dp.buildDataPartitionRepairTask(repairTasks, extentType, tinyExtents, replica)
if err != nil {
log.LogErrorf(errors.Stack(err))
log.LogErrorf("action[repair] partition(%v) err(%v).",
dp.partitionID, err)
dp.moveToBrokenTinyExtentC(extentType, tinyExtents)
return
}
log.LogInfof("action[repair] partition(%v) before prepareRepairTasks", dp.partitionID)
// compare all the extents in the replicas to compute the good and bad ones
availableTinyExtents, brokenTinyExtents := dp.prepareRepairTasks(repairTasks)
// notify the replicas to repair the extent
err = dp.NotifyExtentRepair(repairTasks)
if err != nil {
dp.sendAllTinyExtentsToC(extentType, availableTinyExtents, brokenTinyExtents)
log.LogErrorf("action[repair] partition(%v) err(%v).",
dp.partitionID, err)
log.LogError(errors.Stack(err))
return
}
log.LogDebugf("DoRepair")
// ask the leader to do the repair
dp.DoRepair(repairTasks)
end := time.Now().UnixNano()
// every time we need to figure out which extents need to be repaired and which ones do not.
dp.sendAllTinyExtentsToC(extentType, availableTinyExtents, brokenTinyExtents)
// error check
if dp.extentStore.AvailableTinyExtentCnt()+dp.extentStore.BrokenTinyExtentCnt() > storage.TinyExtentCount {
log.LogWarnf("action[repair] partition(%v) GoodTinyExtents(%v) "+
"BadTinyExtents(%v) finish cost[%vms].", dp.partitionID, dp.extentStore.AvailableTinyExtentCnt(),
dp.extentStore.BrokenTinyExtentCnt(), (end-start)/int64(time.Millisecond))
}
log.LogInfof("action[repair] partition(%v) GoodTinyExtents(%v) BadTinyExtents(%v)"+
" finish cost[%vms] masterAddr(%v).", dp.partitionID, dp.extentStore.AvailableTinyExtentCnt(),
dp.extentStore.BrokenTinyExtentCnt(), (end-start)/int64(time.Millisecond), MasterClient.Nodes())
}
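// Minimal sketch of the size comparison described in the comment above repair
// (illustrative only; prepareRepairTasks below is the real implementation):
// for every extent, keep the replica copy with the largest total size, then
// mark every smaller copy as needing repair.
//
//	maxSize := make(map[uint64]*storage.ExtentInfo)
//	for _, task := range repairTasks {
//		if task == nil {
//			continue
//		}
//		for id, info := range task.extents {
//			if cur, ok := maxSize[id]; !ok || info.TotalSize() > cur.TotalSize() {
//				maxSize[id] = info
//			}
//		}
//	}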
func (dp *DataPartition) buildDataPartitionRepairTask(repairTasks []*DataPartitionRepairTask, extentType uint8, tinyExtents []uint64, replica []string) (err error) {
// get the local extent info
extents, leaderTinyDeleteRecordFileSize, err := dp.getLocalExtentInfo(extentType, tinyExtents)
if err != nil {
return err
}
// new repair task for the leader
log.LogInfof("buildDataPartitionRepairTask dp %v, extent type %v, len extent %v, replica size %v", dp.partitionID, extentType, len(extents), len(replica))
repairTasks[0] = NewDataPartitionRepairTask(extents, leaderTinyDeleteRecordFileSize, replica[0], replica[0])
repairTasks[0].addr = replica[0]
// new repair tasks for the followers
for index := 1; index < len(replica); index++ {
extents, err := dp.getRemoteExtentInfo(extentType, tinyExtents, replica[index])
if err != nil {
log.LogErrorf("buildDataPartitionRepairTask PartitionID(%v) on (%v) err(%v)", dp.partitionID, replica[index], err)
continue
}
log.LogInfof("buildDataPartitionRepairTask dp %v, add new add %v, extent type %v", dp.partitionID, replica[index], extentType)
repairTasks[index] = NewDataPartitionRepairTask(extents, leaderTinyDeleteRecordFileSize, replica[index], replica[0])
repairTasks[index].addr = replica[index]
}
return
}
func (dp *DataPartition) getLocalExtentInfo(extentType uint8, tinyExtents []uint64) (extents []*storage.ExtentInfo, leaderTinyDeleteRecordFileSize int64, err error) {
var localExtents []*storage.ExtentInfo
if proto.IsNormalExtentType(extentType) {
localExtents, leaderTinyDeleteRecordFileSize, err = dp.extentStore.GetAllWatermarks(storage.NormalExtentFilter())
} else {
localExtents, leaderTinyDeleteRecordFileSize, err = dp.extentStore.GetAllWatermarks(storage.TinyExtentFilter(tinyExtents))
}
if err != nil {
err = errors.Trace(err, "getLocalExtentInfo extent DataPartition(%v) GetAllWaterMark", dp.partitionID)
return
}
if len(localExtents) <= 0 {
extents = make([]*storage.ExtentInfo, 0)
return
}
extents = make([]*storage.ExtentInfo, 0, len(localExtents))
for _, et := range localExtents {
newEt := *et
extents = append(extents, &newEt)
}
return
}
func (dp *DataPartition) getRemoteExtentInfo(extentType uint8, tinyExtents []uint64,
target string) (extentFiles []*storage.ExtentInfo, err error) {
p := repl.NewPacketToGetAllWatermarks(dp.partitionID, extentType)
extentFiles = make([]*storage.ExtentInfo, 0)
if proto.IsTinyExtentType(extentType) {
p.Data, err = json.Marshal(tinyExtents)
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) GetAllWatermarks", dp.partitionID)
return
}
p.Size = uint32(len(p.Data))
}
var conn *net.TCPConn
conn, err = gConnPool.GetConnect(target) // get remote connection
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) get host(%v) connect", dp.partitionID, target)
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn) // write command to the remote host
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) write to host(%v)", dp.partitionID, target)
return
}
reply := new(repl.Packet)
err = reply.ReadFromConnWithVer(conn, proto.GetAllWatermarksDeadLineTime) // read the response
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) read from host(%v)", dp.partitionID, target)
return
}
err = json.Unmarshal(reply.Data[:reply.Size], &extentFiles)
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) unmarshal json(%v) from host(%v)",
dp.partitionID, string(reply.Data[:reply.Size]), target)
return
}
return
}
// DoRepair asks the leader to perform the repair tasks.
func (dp *DataPartition) DoRepair(repairTasks []*DataPartitionRepairTask) {
store := dp.extentStore
for _, extentInfo := range repairTasks[0].ExtentsToBeCreated {
if !AutoRepairStatus {
log.LogWarnf("AutoRepairStatus is False,so cannot Create extent(%v),pid=%d", extentInfo.String(), dp.partitionID)
continue
}
if dp.ExtentStore().IsDeletedNormalExtent(extentInfo.FileID) {
continue
}
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
store.Create(extentInfo.FileID)
}
log.LogDebugf("action[DoRepair] leader to repair len[%v], {%v}", len(repairTasks[0].ExtentsToBeRepaired), repairTasks[0].ExtentsToBeRepaired)
for _, extentInfo := range repairTasks[0].ExtentsToBeRepaired {
log.LogDebugf("action[DoRepair] leader to repair len[%v], {%v}", len(repairTasks[0].ExtentsToBeRepaired), extentInfo)
err := dp.streamRepairExtent(extentInfo)
if err != nil {
err = errors.Trace(err, "doStreamExtentFixRepair %v", dp.applyRepairKey(int(extentInfo.FileID)))
localExtentInfo, opErr := dp.ExtentStore().Watermark(uint64(extentInfo.FileID))
if opErr != nil {
err = errors.Trace(err, opErr.Error())
}
err = errors.Trace(err, "partition(%v) remote(%v) local(%v)",
dp.partitionID, extentInfo, localExtentInfo)
log.LogWarnf("action[doStreamExtentFixRepair] err(%v).", err)
}
}
}
func (dp *DataPartition) moveToBrokenTinyExtentC(extentType uint8, extents []uint64) {
if proto.IsTinyExtentType(extentType) {
dp.extentStore.SendAllToBrokenTinyExtentC(extents)
}
}
func (dp *DataPartition) sendAllTinyExtentsToC(extentType uint8, availableTinyExtents, brokenTinyExtents []uint64) {
if !proto.IsTinyExtentType(extentType) {
return
}
for _, extentID := range availableTinyExtents {
if storage.IsTinyExtent(extentID) {
dp.extentStore.SendToAvailableTinyExtentC(extentID)
}
}
for _, extentID := range brokenTinyExtents {
if storage.IsTinyExtent(extentID) {
dp.extentStore.SendToBrokenTinyExtentC(extentID)
}
}
}
func (dp *DataPartition) brokenTinyExtents() (brokenTinyExtents []uint64) {
brokenTinyExtents = make([]uint64, 0)
extentsToBeRepaired := MinTinyExtentsToRepair
if dp.extentStore.AvailableTinyExtentCnt() <= MinAvaliTinyExtentCnt {
extentsToBeRepaired = storage.TinyExtentCount
}
for i := 0; i < extentsToBeRepaired; i++ {
extentID, err := dp.extentStore.GetBrokenTinyExtent()
if err != nil {
return
}
brokenTinyExtents = append(brokenTinyExtents, extentID)
}
return
}
func (dp *DataPartition) prepareRepairTasks(repairTasks []*DataPartitionRepairTask) (availableTinyExtents []uint64, brokenTinyExtents []uint64) {
extentInfoMap := make(map[uint64]*storage.ExtentInfo)
deleteExtents := make(map[uint64]bool)
log.LogInfof("action[prepareRepairTasks] dp %v task len %v", dp.partitionID, len(repairTasks))
for index := 0; index < len(repairTasks); index++ {
repairTask := repairTasks[index]
if repairTask == nil {
continue
}
for extentID, extentInfo := range repairTask.extents {
if extentInfo.IsDeleted {
deleteExtents[extentID] = true
continue
}
extentWithMaxSize, ok := extentInfoMap[extentID]
if !ok {
extentInfoMap[extentID] = extentInfo
} else {
if extentInfo.TotalSize() > extentWithMaxSize.TotalSize() {
extentInfoMap[extentID] = extentInfo
}
}
// log.LogInfof("action[prepareRepairTasks] dp %v extentid %v addr[dst %v,leader %v] info %v", dp.partitionID, extentID, repairTask.addr, repairTask.LeaderAddr, extentInfoMap[extentID])
}
}
for extentID := range deleteExtents {
extentInfo := extentInfoMap[extentID]
if extentInfo != nil {
extentInfo.IsDeleted = true
extentInfoMap[extentID] = extentInfo
}
}
dp.buildExtentCreationTasks(repairTasks, extentInfoMap)
availableTinyExtents, brokenTinyExtents = dp.buildExtentRepairTasks(repairTasks, extentInfoMap)
return
}
// Create a new extent if one of the replicas is missing it.
func (dp *DataPartition) buildExtentCreationTasks(repairTasks []*DataPartitionRepairTask, extentInfoMap map[uint64]*storage.ExtentInfo) {
for extentID, extentInfo := range extentInfoMap {
if storage.IsTinyExtent(extentID) {
continue
}
for index := 0; index < len(repairTasks); index++ {
repairTask := repairTasks[index]
if repairTask == nil {
continue
}
if _, ok := repairTask.extents[extentID]; !ok && !extentInfo.IsDeleted {
if storage.IsTinyExtent(extentID) {
continue
}
if extentInfo.IsDeleted {
continue
}
if dp.ExtentStore().IsDeletedNormalExtent(extentID) {
continue
}
ei := &storage.ExtentInfo{Source: extentInfo.Source, FileID: extentID, Size: extentInfo.Size, SnapshotDataOff: extentInfo.SnapshotDataOff}
repairTask.ExtentsToBeCreated = append(repairTask.ExtentsToBeCreated, ei)
repairTask.ExtentsToBeRepaired = append(repairTask.ExtentsToBeRepaired, ei)
log.LogInfof("action[generatorAddExtentsTasks] addFile(%v_%v) on Index(%v).", dp.partitionID, ei, index)
}
}
}
}
// Repair an extent if the replicas do not have the same length.
func (dp *DataPartition) buildExtentRepairTasks(repairTasks []*DataPartitionRepairTask, maxSizeExtentMap map[uint64]*storage.ExtentInfo) (availableTinyExtents []uint64, brokenTinyExtents []uint64) {
availableTinyExtents = make([]uint64, 0)
brokenTinyExtents = make([]uint64, 0)
for extentID, maxFileInfo := range maxSizeExtentMap {
hasBeenRepaired := true
for index := 0; index < len(repairTasks); index++ {
if repairTasks[index] == nil {
continue
}
extentInfo, ok := repairTasks[index].extents[extentID]
if !ok {
continue
}
if extentInfo.IsDeleted {
continue
}
if dp.ExtentStore().IsDeletedNormalExtent(extentID) {
continue
}
if extentInfo.TotalSize() < maxFileInfo.TotalSize() {
fixExtent := &storage.ExtentInfo{Source: maxFileInfo.Source, FileID: extentID, Size: maxFileInfo.Size, SnapshotDataOff: maxFileInfo.SnapshotDataOff}
repairTasks[index].ExtentsToBeRepaired = append(repairTasks[index].ExtentsToBeRepaired, fixExtent)
log.LogInfof("action[generatorFixExtentSizeTasks] fixExtent(%v_%v) on Index(%v) on(%v).",
dp.partitionID, fixExtent, index, repairTasks[index].addr)
hasBeenRepaired = false
}
}
if storage.IsTinyExtent(extentID) {
if hasBeenRepaired {
availableTinyExtents = append(availableTinyExtents, extentID)
} else {
brokenTinyExtents = append(brokenTinyExtents, extentID)
}
}
}
return
}
func (dp *DataPartition) notifyFollower(wg *sync.WaitGroup, index int, members []*DataPartitionRepairTask) (err error) {
p := repl.NewPacketToNotifyExtentRepair(dp.partitionID) // notify all the followers to repair
var conn *net.TCPConn
// target := dp.getReplicaAddr(index)
// Use the address recorded in the task to avoid a panic when dp's replica list has changed.
target := members[index].addr
p.Data, _ = json.Marshal(members[index])
p.Size = uint32(len(p.Data))
conn, err = gConnPool.GetConnect(target)
defer func() {
wg.Done()
if err == nil {
log.LogInfof(ActionNotifyFollowerToRepair+" to host(%v) Partition(%v) done", target, dp.partitionID)
} else {
log.LogErrorf(ActionNotifyFollowerToRepair+" to host(%v) Partition(%v) failed, err(%v)", target, dp.partitionID, err)
}
}()
if err != nil {
return err
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
if err = p.WriteToConn(conn); err != nil {
return err
}
if err = p.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime); err != nil {
return err
}
return err
}
// NotifyExtentRepair notifies the followers to repair.
func (dp *DataPartition) NotifyExtentRepair(members []*DataPartitionRepairTask) (err error) {
wg := new(sync.WaitGroup)
for i := 1; i < len(members); i++ {
if members[i] == nil || !dp.IsExistReplica(members[i].addr) {
if members[i] != nil {
log.LogInfof("notify extend repair is change ,index(%v),pid(%v),task_member_add(%v),IsExistReplica(%v)",
i, dp.partitionID, members[i].addr, dp.IsExistReplica(members[i].addr))
}
continue
}
wg.Add(1)
go dp.notifyFollower(wg, i, members)
}
wg.Wait()
return
}
// doStreamExtentFixRepair executes the repair of a single extent on a follower.
func (dp *DataPartition) doStreamExtentFixRepair(wg *sync.WaitGroup, remoteExtentInfo *storage.ExtentInfo) {
defer wg.Done()
err := dp.streamRepairExtent(remoteExtentInfo)
if err != nil {
// only decommission repair needs to check the error count
if dp.isDecommissionRecovering() {
atomic.AddUint64(&dp.recoverErrCnt, 1)
if atomic.LoadUint64(&dp.recoverErrCnt) >= dp.dataNode.GetDpMaxRepairErrCnt() {
dp.handleDecommissionRecoverFailed()
return
}
}
err = errors.Trace(err, "doStreamExtentFixRepair %v", dp.applyRepairKey(int(remoteExtentInfo.FileID)))
localExtentInfo, opErr := dp.ExtentStore().Watermark(uint64(remoteExtentInfo.FileID))
if opErr != nil {
err = errors.Trace(err, opErr.Error())
}
err = errors.Trace(err, "partition(%v) remote(%v) local(%v)",
dp.partitionID, remoteExtentInfo, localExtentInfo)
log.LogWarnf("action[doStreamExtentFixRepair] err(%v).", err)
}
}
func (dp *DataPartition) applyRepairKey(extentID int) (m string) {
return fmt.Sprintf("ApplyRepairKey(%v_%v)", dp.partitionID, extentID)
}
// The actual repair of an extent happens here.
func (dp *DataPartition) streamRepairExtent(remoteExtentInfo *storage.ExtentInfo) (err error) {
log.LogDebugf("streamRepairExtent dp %v remote info %v", dp.partitionID, remoteExtentInfo)
store := dp.ExtentStore()
if !store.HasExtent(remoteExtentInfo.FileID) {
log.LogDebugf("streamRepairExtent remote info %v not exist", remoteExtentInfo)
return
}
if !AutoRepairStatus && !storage.IsTinyExtent(remoteExtentInfo.FileID) {
log.LogWarnf("AutoRepairStatus is False,so cannot AutoRepair extent(%v)", remoteExtentInfo.String())
return
}
localExtentInfo, err := store.Watermark(remoteExtentInfo.FileID)
if err != nil {
log.LogDebugf("streamRepairExtent local %v remote info %v", localExtentInfo, remoteExtentInfo)
return errors.Trace(err, "streamRepairExtent Watermark error")
}
log.LogDebugf("streamRepairExtent dp %v remote info %v,local %v", dp.partitionID, remoteExtentInfo, localExtentInfo)
if dp.ExtentStore().IsDeletedNormalExtent(remoteExtentInfo.FileID) {
log.LogDebugf("streamRepairExtent local %v remote info %v", localExtentInfo, remoteExtentInfo)
return nil
}
if localExtentInfo.Size >= remoteExtentInfo.Size && localExtentInfo.SnapshotDataOff >= remoteExtentInfo.SnapshotDataOff {
log.LogDebugf("streamRepairExtent local %v remote info %v", localExtentInfo, remoteExtentInfo)
return nil
}
doWork := func(wType int, currFixOffset uint64, dstOffset uint64, request *repl.Packet) (err error) {
log.LogDebugf("streamRepairExtent. currFixOffset %v dstOffset %v, request %v", currFixOffset, dstOffset, request)
var conn net.Conn
conn, err = dp.getRepairConn(remoteExtentInfo.Source)
if err != nil {
return errors.Trace(err, "streamRepairExtent get conn from host(%v) error", remoteExtentInfo.Source)
}
defer func() {
dp.putRepairConn(conn, err != nil)
}()
if err = request.WriteToConn(conn); err != nil {
err = errors.Trace(err, "streamRepairExtent send streamRead to host(%v) error", remoteExtentInfo.Source)
log.LogWarnf("action[streamRepairExtent] err(%v).", err)
return
}
var hasRecoverySize uint64
var loopTimes uint64
for currFixOffset < dstOffset {
if currFixOffset >= dstOffset {
break
}
reply := repl.NewPacket()
// read 64k streaming repair packet
if err = reply.ReadFromConnWithVer(conn, 60); err != nil {
err = errors.Trace(err, "streamRepairExtent receive data error,localExtentSize(%v) remoteExtentSize(%v)", currFixOffset, dstOffset)
return
}
if reply.ResultCode != proto.OpOk {
err = errors.Trace(fmt.Errorf("unknow result code"),
"streamRepairExtent receive opcode error(%v) ,localExtentSize(%v) remoteExtentSize(%v)", string(reply.Data[:intMin(len(reply.Data), int(reply.Size))]), currFixOffset, remoteExtentInfo.Size)
return
}
if reply.ReqID != request.ReqID || reply.PartitionID != request.PartitionID ||
reply.ExtentID != request.ExtentID {
err = errors.Trace(fmt.Errorf("unavali reply"), "streamRepairExtent receive unavalid "+
"request(%v) reply(%v) ,localExtentSize(%v) remoteExtentSize(%v)", request.GetUniqueLogId(), reply.GetUniqueLogId(), currFixOffset, dstOffset)
return
}
if !storage.IsTinyExtent(reply.ExtentID) && (reply.Size == 0 || reply.ExtentOffset != int64(currFixOffset)) {
err = errors.Trace(fmt.Errorf("unavali reply"), "streamRepairExtent receive unavalid "+
"request(%v) reply(%v) localExtentSize(%v) remoteExtentSize(%v)", request.GetUniqueLogId(), reply.GetUniqueLogId(), currFixOffset, dstOffset)
return
}
if loopTimes%100 == 0 {
log.LogInfof(fmt.Sprintf("action[streamRepairExtent] fix(%v_%v) start fix from (%v)"+
" remoteSize(%v)localSize(%v) reply(%v).", dp.partitionID, localExtentInfo.FileID, remoteExtentInfo.String(),
dstOffset, currFixOffset, reply.GetUniqueLogId()))
}
loopTimes++
actualCrc := crc32.ChecksumIEEE(reply.Data[:reply.Size])
if reply.CRC != actualCrc {
err = fmt.Errorf("streamRepairExtent crc mismatch expectCrc(%v) actualCrc(%v) extent(%v_%v) start fix from (%v)"+
" remoteSize(%v) localSize(%v) request(%v) reply(%v) ", reply.CRC, actualCrc, dp.partitionID, remoteExtentInfo.String(),
remoteExtentInfo.Source, dstOffset, currFixOffset, request.GetUniqueLogId(), reply.GetUniqueLogId())
return errors.Trace(err, "streamRepairExtent receive data error")
}
isEmptyResponse := false
// Write it to local extent file
if storage.IsTinyExtent(uint64(localExtentInfo.FileID)) {
currRecoverySize := uint64(reply.Size)
var remoteAvaliSize uint64
if reply.ArgLen == TinyExtentRepairReadResponseArgLen {
remoteAvaliSize = binary.BigEndian.Uint64(reply.Arg[9:TinyExtentRepairReadResponseArgLen])
}
if reply.Arg != nil { // compatibility with v1.2.0 recovery
isEmptyResponse = reply.Arg[0] == EmptyResponse
}
if isEmptyResponse {
currRecoverySize = binary.BigEndian.Uint64(reply.Arg[1:9])
reply.Size = uint32(currRecoverySize)
}
err = store.TinyExtentRecover(uint64(localExtentInfo.FileID), int64(currFixOffset), int64(currRecoverySize), reply.Data, reply.CRC, isEmptyResponse)
if hasRecoverySize+currRecoverySize >= remoteAvaliSize {
log.LogInfof("streamRepairTinyExtent(%v) recover fininsh,remoteAvaliSize(%v) "+
"hasRecoverySize(%v) currRecoverySize(%v)", dp.applyRepairKey(int(localExtentInfo.FileID)),
remoteAvaliSize, hasRecoverySize+currRecoverySize, currRecoverySize)
break
}
} else {
log.LogDebugf("streamRepairExtent reply size %v, currFixoffset %v, reply %v ", reply.Size, currFixOffset, reply)
_, err = store.Write(uint64(localExtentInfo.FileID), int64(currFixOffset), int64(reply.Size), reply.Data, reply.CRC, wType, BufferWrite)
}
// log.LogDebugf("streamRepairExtent reply size %v, currFixoffset %v, reply %v err %v", reply.Size, currFixOffset, reply, err)
// write to the local extent file
if err != nil {
err = errors.Trace(err, "streamRepairExtent repair data error ")
return
}
hasRecoverySize += uint64(reply.Size)
currFixOffset += uint64(reply.Size)
if currFixOffset >= dstOffset {
log.LogWarnf(fmt.Sprintf("action[streamRepairExtent] fix(%v_%v) start fix from (%v)"+
" remoteSize(%v)localSize(%v) reply(%v).", dp.partitionID, localExtentInfo.FileID, remoteExtentInfo.String(),
dstOffset, currFixOffset, reply.GetUniqueLogId()))
break
}
}
return
}
// size difference between the local extent and the remote extent
var request *repl.Packet
sizeDiff := remoteExtentInfo.Size - localExtentInfo.Size
if storage.IsTinyExtent(remoteExtentInfo.FileID) {
if sizeDiff >= math.MaxUint32 {
sizeDiff = math.MaxUint32 - util.MB
}
request = repl.NewTinyExtentRepairReadPacket(dp.partitionID, remoteExtentInfo.FileID, int(localExtentInfo.Size), int(sizeDiff))
currFixOffset := localExtentInfo.Size
return doWork(0, currFixOffset, remoteExtentInfo.Size, request)
} else {
if sizeDiff > 0 {
log.LogDebugf("streamRepairExtent. local info %v, remote %v", localExtentInfo, remoteExtentInfo)
request = repl.NewExtentRepairReadPacket(dp.partitionID, remoteExtentInfo.FileID, int(localExtentInfo.Size), int(sizeDiff))
currFixOffset := localExtentInfo.Size
if err = doWork(storage.AppendWriteType, currFixOffset, remoteExtentInfo.Size, request); err != nil {
return
}
}
sizeDiffVerAppend := remoteExtentInfo.SnapshotDataOff - localExtentInfo.SnapshotDataOff
if sizeDiffVerAppend > 0 {
request = repl.NewExtentRepairReadPacket(dp.partitionID, remoteExtentInfo.FileID, int(localExtentInfo.SnapshotDataOff), int(sizeDiffVerAppend))
currFixOffset := localExtentInfo.SnapshotDataOff
return doWork(storage.AppendRandomWriteType, currFixOffset, remoteExtentInfo.SnapshotDataOff, request)
}
}
return
}
func intMin(a, b int) int {
if a < b {
return a
} else {
return b
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"context"
"fmt"
syslog "log"
"os"
"path"
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/loadutil"
"github.com/cubefs/cubefs/util/log"
"github.com/shirou/gopsutil/disk"
)
var (
// RegexpDataPartitionDir validates the directory name of a data partition.
RegexpDataPartitionDir, _ = regexp.Compile(`^datapartition_(\d)+_(\d)+$`)
RegexpCachePartitionDir, _ = regexp.Compile(`^cachepartition_(\d)+_(\d)+$`)
RegexpPreLoadPartitionDir, _ = regexp.Compile(`^preloadpartition_(\d)+_(\d)+$`)
RegexpExpiredDataPartitionDir, _ = regexp.Compile(`^expired_datapartition_(\d)+_(\d)+$`)
)
const (
ExpiredPartitionPrefix = "expired_"
ExpiredPartitionExistTime = time.Hour * time.Duration(24*7)
)
const (
DecommissionDiskMark = "decommissionDiskMark"
)
// Disk represents the structure of the disk
type Disk struct {
sync.RWMutex
Path string
ReadErrCnt uint64 // number of read errors
WriteErrCnt uint64 // number of write errors
Total uint64
Used uint64
Available uint64
Unallocated uint64
Allocated uint64
MaxErrCnt int // maximum number of errors
Status int // disk status such as READONLY
ReservedSpace uint64
DiskRdonlySpace uint64
RejectWrite bool
partitionMap map[uint64]*DataPartition
syncTinyDeleteRecordFromLeaderOnEveryDisk chan bool
space *SpaceManager
dataNode *DataNode
limitFactor map[uint32]*rate.Limiter
limitRead *ioLimiter
limitWrite *ioLimiter
// diskPartition info
diskPartition *disk.PartitionStat
DiskErrPartitionSet map[uint64]struct{}
decommission bool
}
const (
SyncTinyDeleteRecordFromLeaderOnEveryDisk = 5
)
type PartitionVisitor func(dp *DataPartition)
func NewDisk(path string, reservedSpace, diskRdonlySpace uint64, maxErrCnt int, space *SpaceManager) (d *Disk, err error) {
d = new(Disk)
d.Path = path
d.ReservedSpace = reservedSpace
d.DiskRdonlySpace = diskRdonlySpace
d.MaxErrCnt = maxErrCnt
d.RejectWrite = false
d.space = space
d.dataNode = space.dataNode
d.partitionMap = make(map[uint64]*DataPartition)
d.syncTinyDeleteRecordFromLeaderOnEveryDisk = make(chan bool, SyncTinyDeleteRecordFromLeaderOnEveryDisk)
err = d.computeUsage()
if err != nil {
return nil, err
}
err = d.updateSpaceInfo()
if err != nil {
return nil, err
}
// get disk partition info
d.diskPartition, err = loadutil.GetMatchParation(d.Path)
if err != nil {
// log but let execution continue
log.LogErrorf("get partition info error, path is %v error message %v", d.Path, err.Error())
err = nil
}
d.startScheduleToUpdateSpaceInfo()
d.limitFactor = make(map[uint32]*rate.Limiter, 0)
d.limitFactor[proto.FlowReadType] = rate.NewLimiter(rate.Limit(proto.QosDefaultDiskMaxFLowLimit), proto.QosDefaultBurst)
d.limitFactor[proto.FlowWriteType] = rate.NewLimiter(rate.Limit(proto.QosDefaultDiskMaxFLowLimit), proto.QosDefaultBurst)
d.limitFactor[proto.IopsReadType] = rate.NewLimiter(rate.Limit(proto.QosDefaultDiskMaxIoLimit), defaultIOLimitBurst)
d.limitFactor[proto.IopsWriteType] = rate.NewLimiter(rate.Limit(proto.QosDefaultDiskMaxIoLimit), defaultIOLimitBurst)
d.limitRead = newIOLimiter(space.dataNode.diskReadFlow, space.dataNode.diskReadIocc)
d.limitWrite = newIOLimiter(space.dataNode.diskWriteFlow, space.dataNode.diskWriteIocc)
d.DiskErrPartitionSet = make(map[uint64]struct{}, 0)
err = d.initDecommissionStatus()
if err != nil {
log.LogErrorf("action[NewDisk]: failed to load disk decommission status")
// NOTE: continue execution
err = nil
}
return
}
func (d *Disk) MarkDecommissionStatus(decommission bool) {
probePath := path.Join(d.Path, DecommissionDiskMark)
var err error
defer func() {
if err != nil {
log.LogErrorf("action[MarkDecommissionStatus]: %v", err)
return
}
}()
if decommission {
var file *os.File
if file, err = os.Create(probePath); err == nil {
file.Close()
}
} else {
err = os.Remove(probePath)
if os.IsNotExist(err) {
err = nil
}
}
d.decommission = decommission
}
func (d *Disk) GetDecommissionStatus() bool {
return d.decommission
}
func (d *Disk) initDecommissionStatus() error {
probePath := path.Join(d.Path, DecommissionDiskMark)
_, err := os.Stat(probePath)
if err == nil {
d.decommission = true
return nil
}
if os.IsNotExist(err) {
return nil
}
return err
}
func (d *Disk) GetDiskPartition() *disk.PartitionStat {
return d.diskPartition
}
func (d *Disk) updateQosLimiter() {
if d.dataNode.diskReadFlow > 0 {
d.limitFactor[proto.FlowReadType].SetLimit(rate.Limit(d.dataNode.diskReadFlow))
}
if d.dataNode.diskWriteFlow > 0 {
d.limitFactor[proto.FlowWriteType].SetLimit(rate.Limit(d.dataNode.diskWriteFlow))
}
if d.dataNode.diskReadIops > 0 {
d.limitFactor[proto.IopsReadType].SetLimit(rate.Limit(d.dataNode.diskReadIops))
}
if d.dataNode.diskWriteIops > 0 {
d.limitFactor[proto.IopsWriteType].SetLimit(rate.Limit(d.dataNode.diskWriteIops))
}
for i := proto.IopsReadType; i < proto.FlowWriteType; i++ {
log.LogInfof("action[updateQosLimiter] type %v limit %v", proto.QosTypeString(i), d.limitFactor[i].Limit())
}
log.LogInfof("action[updateQosLimiter] read(iocc:%d iops:%d flow:%d) write(iocc:%d iops:%d flow:%d)",
d.dataNode.diskReadIocc, d.dataNode.diskReadIops, d.dataNode.diskReadFlow,
d.dataNode.diskWriteIocc, d.dataNode.diskWriteIops, d.dataNode.diskWriteFlow)
d.limitRead.ResetIO(d.dataNode.diskReadIocc)
d.limitRead.ResetFlow(d.dataNode.diskReadFlow)
d.limitWrite.ResetIO(d.dataNode.diskWriteIocc)
d.limitWrite.ResetFlow(d.dataNode.diskWriteFlow)
}
func (d *Disk) allocCheckLimit(factorType uint32, used uint32) error {
if !(d.dataNode.diskQosEnableFromMaster && d.dataNode.diskQosEnable) {
return nil
}
ctx := context.Background()
d.limitFactor[factorType].WaitN(ctx, int(used))
return nil
}
// PartitionCount returns the number of partitions in the partition map.
func (d *Disk) PartitionCount() int {
d.RLock()
defer d.RUnlock()
return len(d.partitionMap)
}
func (d *Disk) CanWrite() bool {
if d.Status == proto.ReadWrite || !d.RejectWrite {
return true
}
// If ReservedSpace < diskFreeSpace < DiskRdonlySpace, writes are still allowed,
// but the disk and its partitions are read-only and no new partition can be created.
// If diskFreeSpace < ReservedSpace, writes are not allowed either.
if d.Total+d.DiskRdonlySpace > d.Used+d.ReservedSpace {
return true
}
return false
}
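// Worked example of the threshold above (the numbers are illustrative only):
// with Total = 1000 GiB, DiskRdonlySpace = 10 GiB, Used = 900 GiB and
// ReservedSpace = 5 GiB, the check 1010 GiB > 905 GiB holds, so writes to
// existing partitions are still accepted even though the disk may already be
// read-only for new partition creation.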
// Compute the disk usage
func (d *Disk) computeUsage() (err error) {
d.RLock()
defer d.RUnlock()
fs := syscall.Statfs_t{}
err = syscall.Statfs(d.Path, &fs)
if err != nil {
log.LogErrorf("computeUsage. err %v", err)
return
}
repairSize := uint64(d.repairAllocSize())
// total := math.Max(0, int64(fs.Blocks*uint64(fs.Bsize) - d.PreReserveSpace))
total := int64(fs.Blocks*uint64(fs.Bsize) - d.DiskRdonlySpace)
if total < 0 {
total = 0
}
d.Total = uint64(total)
// available := math.Max(0, int64(fs.Bavail*uint64(fs.Bsize) - d.PreReserveSpace))
available := int64(fs.Bavail*uint64(fs.Bsize) - d.DiskRdonlySpace - repairSize)
if available < 0 {
available = 0
}
d.Available = uint64(available)
// used := math.Max(0, int64(total - available))
free := int64(fs.Bfree*uint64(fs.Bsize) - d.DiskRdonlySpace - repairSize)
used := int64(total - free)
if used < 0 {
used = 0
}
d.Used = uint64(used)
allocatedSize := int64(0)
for _, dp := range d.partitionMap {
allocatedSize += int64(dp.Size())
}
log.LogDebugf("computeUsage. fs info [%v,%v,%v,%v] total %v available %v DiskRdonlySpace %v ReservedSpace %v allocatedSize %v",
fs.Blocks, fs.Bsize, fs.Bavail, fs.Bfree, d.Total, d.Available, d.DiskRdonlySpace, d.ReservedSpace, allocatedSize)
atomic.StoreUint64(&d.Allocated, uint64(allocatedSize))
// unallocated = math.Max(0, total - allocatedSize)
unallocated := total - allocatedSize
if unallocated < 0 {
unallocated = 0
}
if d.Available <= 0 {
d.RejectWrite = true
} else {
d.RejectWrite = false
}
d.Unallocated = uint64(unallocated)
log.LogDebugf("action[computeUsage] disk(%v) all(%v) available(%v) used(%v)", d.Path, d.Total, d.Available, d.Used)
return
}
func (d *Disk) repairAllocSize() int {
allocSize := 0
for _, dp := range d.partitionMap {
if dp.DataPartitionCreateType == proto.NormalCreateDataPartition || dp.leaderSize <= dp.used {
continue
}
allocSize += dp.leaderSize - dp.used
}
return allocSize
}
func (d *Disk) incReadErrCnt() {
atomic.AddUint64(&d.ReadErrCnt, 1)
}
func (d *Disk) getReadErrCnt() uint64 {
return atomic.LoadUint64(&d.ReadErrCnt)
}
func (d *Disk) incWriteErrCnt() {
atomic.AddUint64(&d.WriteErrCnt, 1)
}
func (d *Disk) getWriteErrCnt() uint64 {
return atomic.LoadUint64(&d.WriteErrCnt)
}
func (d *Disk) getTotalErrCnt() uint64 {
return d.getReadErrCnt() + d.getWriteErrCnt()
}
func (d *Disk) startScheduleToUpdateSpaceInfo() {
go func() {
updateSpaceInfoTicker := time.NewTicker(5 * time.Second)
checkStatusTicker := time.NewTicker(time.Minute * 2)
defer func() {
updateSpaceInfoTicker.Stop()
checkStatusTicker.Stop()
}()
for {
select {
case <-updateSpaceInfoTicker.C:
d.computeUsage()
d.updateSpaceInfo()
case <-checkStatusTicker.C:
d.checkDiskStatus()
}
}
}()
}
func (d *Disk) doBackendTask() {
for {
partitions := make([]*DataPartition, 0)
d.RLock()
for _, dp := range d.partitionMap {
partitions = append(partitions, dp)
}
d.RUnlock()
for _, dp := range partitions {
dp.extentStore.BackendTask()
}
time.Sleep(time.Minute)
}
}
const (
DiskStatusFile = ".diskStatus"
)
func (d *Disk) checkDiskStatus() {
if d.Status == proto.Unavailable {
log.LogInfof("[checkDiskStatus] disk status is unavailable, no need to check, disk path(%v)", d.Path)
return
}
path := path.Join(d.Path, DiskStatusFile)
fp, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0o755)
if err != nil {
d.CheckDiskError(err, ReadFlag)
return
}
defer fp.Close()
data := []byte(DiskStatusFile)
_, err = fp.WriteAt(data, 0)
if err != nil {
d.CheckDiskError(err, WriteFlag)
return
}
if err = fp.Sync(); err != nil {
d.CheckDiskError(err, WriteFlag)
return
}
if _, err = fp.ReadAt(data, 0); err != nil {
d.CheckDiskError(err, ReadFlag)
return
}
}
const DiskErrNotAssociatedWithPartition uint64 = 0 // use 0 for disk error without any data partition
func (d *Disk) CheckDiskError(err error, rwFlag uint8) {
if err == nil {
return
}
log.LogWarnf("CheckDiskError disk err: %v, disk:%v", err.Error(), d.Path)
if !IsDiskErr(err.Error()) {
return
}
d.triggerDiskError(rwFlag, DiskErrNotAssociatedWithPartition)
}
func (d *Disk) doDiskError() {
d.Status = proto.Unavailable
// d.ForceExitRaftStore()
}
func (d *Disk) triggerDiskError(rwFlag uint8, dpId uint64) {
mesg := fmt.Sprintf("disk path %v error on %v, dpId %v", d.Path, LocalIP, dpId)
exporter.Warning(mesg)
log.LogWarnf(mesg)
if rwFlag == WriteFlag {
d.incWriteErrCnt()
} else if rwFlag == ReadFlag {
d.incReadErrCnt()
} else {
d.incWriteErrCnt()
d.incReadErrCnt()
}
d.AddDiskErrPartition(dpId)
diskErrCnt := d.getTotalErrCnt()
diskErrPartitionCnt := d.GetDiskErrPartitionCount()
if diskErrPartitionCnt >= d.dataNode.diskUnavailablePartitionErrorCount {
msg := fmt.Sprintf("set disk unavailable for too many disk error, "+
"disk path(%v), ip(%v), diskErrCnt(%v), diskErrPartitionCnt(%v) threshold(%v)",
d.Path, LocalIP, diskErrCnt, diskErrPartitionCnt, d.dataNode.diskUnavailablePartitionErrorCount)
exporter.Warning(msg)
log.LogWarnf(msg)
d.doDiskError()
}
}
func (d *Disk) updateSpaceInfo() (err error) {
var statsInfo syscall.Statfs_t
if err = syscall.Statfs(d.Path, &statsInfo); err != nil {
d.incReadErrCnt()
}
if d.Status == proto.Unavailable {
mesg := fmt.Sprintf("disk path %v error on %v", d.Path, LocalIP)
log.LogErrorf(mesg)
exporter.Warning(mesg)
// d.ForceExitRaftStore()
} else if d.Available <= 0 {
d.Status = proto.ReadOnly
} else {
d.Status = proto.ReadWrite
}
log.LogDebugf("action[updateSpaceInfo] disk(%v) total(%v) available(%v) remain(%v) "+
"restSize(%v) preRestSize (%v) maxErrs(%v) readErrs(%v) writeErrs(%v) status(%v)", d.Path,
d.Total, d.Available, d.Unallocated, d.ReservedSpace, d.DiskRdonlySpace, d.MaxErrCnt, d.ReadErrCnt, d.WriteErrCnt, d.Status)
return
}
// AttachDataPartition adds a data partition to the partition map.
func (d *Disk) AttachDataPartition(dp *DataPartition) {
d.Lock()
d.partitionMap[dp.partitionID] = dp
d.Unlock()
d.computeUsage()
}
// DetachDataPartition removes a data partition from the partition map.
func (d *Disk) DetachDataPartition(dp *DataPartition) {
d.Lock()
delete(d.partitionMap, dp.partitionID)
delete(d.DiskErrPartitionSet, dp.partitionID)
d.Unlock()
d.computeUsage()
}
// GetDataPartition returns the data partition based on the given partition ID.
func (d *Disk) GetDataPartition(partitionID uint64) (partition *DataPartition) {
d.RLock()
defer d.RUnlock()
return d.partitionMap[partitionID]
}
func (d *Disk) GetDataPartitionCount() int {
d.RLock()
defer d.RUnlock()
return len(d.partitionMap)
}
func (d *Disk) ForceExitRaftStore() {
partitionList := d.DataPartitionList()
for _, partitionID := range partitionList {
partition := d.GetDataPartition(partitionID)
partition.partitionStatus = proto.Unavailable
partition.stopRaft()
}
}
// DataPartitionList returns a list of the data partitions
func (d *Disk) DataPartitionList() (partitionIDs []uint64) {
d.Lock()
defer d.Unlock()
partitionIDs = make([]uint64, 0, len(d.partitionMap))
for _, dp := range d.partitionMap {
partitionIDs = append(partitionIDs, dp.partitionID)
}
return
}
func unmarshalPartitionName(name string) (partitionID uint64, partitionSize int, err error) {
arr := strings.Split(name, "_")
if len(arr) != 3 {
err = fmt.Errorf("error DataPartition name(%v)", name)
return
}
if partitionID, err = strconv.ParseUint(arr[1], 10, 64); err != nil {
return
}
if partitionSize, err = strconv.Atoi(arr[2]); err != nil {
return
}
return
}
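// Example (the values are illustrative): a data partition directory such as
// "datapartition_1024_128849018880" encodes the partition ID and size, so
//
//	id, size, err := unmarshalPartitionName("datapartition_1024_128849018880")
//	// id == 1024, size == 128849018880 (120 GiB), err == nil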
func (d *Disk) isPartitionDir(filename string) (isPartitionDir bool) {
isPartitionDir = RegexpDataPartitionDir.MatchString(filename) ||
RegexpCachePartitionDir.MatchString(filename) ||
RegexpPreLoadPartitionDir.MatchString(filename)
return
}
func (d *Disk) isExpiredPartitionDir(filename string) (isExpiredPartitionDir bool) {
isExpiredPartitionDir = RegexpExpiredDataPartitionDir.MatchString(filename)
return
}
// RestorePartition reads the files stored on the local disk and restores the data partitions.
func (d *Disk) RestorePartition(visitor PartitionVisitor) (err error) {
convert := func(node *proto.DataNodeInfo) *DataNodeInfo {
result := &DataNodeInfo{}
result.Addr = node.Addr
result.PersistenceDataPartitions = node.PersistenceDataPartitions
return result
}
var dataNode *proto.DataNodeInfo
for i := 0; i < 3; i++ {
dataNode, err = MasterClient.NodeAPI().GetDataNode(d.space.dataNode.localServerAddr)
if err != nil {
log.LogErrorf("action[RestorePartition]: getDataNode error %v", err)
continue
}
break
}
if dataNode == nil {
log.LogErrorf("action[RestorePartition]: failed to get data node info after retries, err(%v)", err)
return err
}
dinfo := convert(dataNode)
if len(dinfo.PersistenceDataPartitions) == 0 {
log.LogWarnf("action[RestorePartition]: length of PersistenceDataPartitions is 0, ExpiredPartition check " +
"without effect")
}
var (
partitionID uint64
partitionSize int
)
fileInfoList, err := os.ReadDir(d.Path)
if err != nil {
log.LogErrorf("action[RestorePartition] read dir(%v) err(%v).", d.Path, err)
return err
}
var (
wg sync.WaitGroup
toDeleteExpiredPartitionNames = make([]string, 0)
)
for _, fileInfo := range fileInfoList {
filename := fileInfo.Name()
if !d.isPartitionDir(filename) {
if d.isExpiredPartitionDir(filename) {
name := path.Join(d.Path, filename)
toDeleteExpiredPartitionNames = append(toDeleteExpiredPartitionNames, name)
log.LogInfof("action[RestorePartition] find expired partition on path(%s)", name)
}
continue
}
if partitionID, partitionSize, err = unmarshalPartitionName(filename); err != nil {
log.LogErrorf("action[RestorePartition] unmarshal partitionName(%v) from disk(%v) err(%v) ",
filename, d.Path, err.Error())
continue
}
log.LogDebugf("acton[RestorePartition] disk(%v) path(%v) PartitionID(%v) partitionSize(%v).",
d.Path, fileInfo.Name(), partitionID, partitionSize)
if isExpiredPartition(partitionID, dinfo.PersistenceDataPartitions) {
log.LogErrorf("action[RestorePartition]: find expired partition[%s], rename it and you can delete it "+
"manually", filename)
oldName := path.Join(d.Path, filename)
newName := path.Join(d.Path, ExpiredPartitionPrefix+filename)
os.Rename(oldName, newName)
toDeleteExpiredPartitionNames = append(toDeleteExpiredPartitionNames, newName)
continue
}
wg.Add(1)
go func(partitionID uint64, filename string) {
var (
dp *DataPartition
err error
)
defer wg.Done()
if dp, err = LoadDataPartition(path.Join(d.Path, filename), d); err != nil {
mesg := fmt.Sprintf("action[RestorePartition] new partition(%v) err(%v) ",
partitionID, err.Error())
log.LogError(mesg)
exporter.Warning(mesg)
syslog.Println(mesg)
return
}
if visitor != nil {
visitor(dp)
}
}(partitionID, filename)
}
if len(toDeleteExpiredPartitionNames) > 0 {
log.LogInfof("action[RestorePartition] expiredPartitions %v, disk %v", toDeleteExpiredPartitionNames, d.Path)
notDeletedExpiredPartitionNames := d.deleteExpiredPartitions(toDeleteExpiredPartitionNames)
if len(notDeletedExpiredPartitionNames) > 0 {
go func(toDeleteExpiredPartitions []string) {
ticker := time.NewTicker(ExpiredPartitionExistTime)
log.LogInfof("action[RestorePartition] delete expiredPartitions automatically start, toDeleteExpiredPartitions %v", toDeleteExpiredPartitions)
<-ticker.C
d.deleteExpiredPartitions(toDeleteExpiredPartitionNames)
ticker.Stop()
log.LogInfof("action[RestorePartition] delete expiredPartitions automatically finish")
}(notDeletedExpiredPartitionNames)
}
}
wg.Wait()
return err
}
func (d *Disk) deleteExpiredPartitions(toDeleteExpiredPartitionNames []string) (notDeletedExpiredPartitionNames []string) {
notDeletedExpiredPartitionNames = make([]string, 0)
for _, partitionName := range toDeleteExpiredPartitionNames {
dirName, fileName := path.Split(partitionName)
if !d.isExpiredPartitionDir(fileName) {
log.LogInfof("action[deleteExpiredPartitions] partition %v on %v is not expiredPartition", fileName, dirName)
continue
}
dirInfo, err := os.Stat(partitionName)
if err != nil {
log.LogErrorf("action[deleteExpiredPartitions] stat expiredPartition %v fail, err(%v)", partitionName, err)
continue
}
dirStat := dirInfo.Sys().(*syscall.Stat_t)
nowTime := time.Now().Unix()
expiredTime := dirStat.Ctim.Sec
if nowTime-expiredTime >= int64(ExpiredPartitionExistTime.Seconds()) {
err := os.RemoveAll(partitionName)
if err != nil {
log.LogErrorf("action[deleteExpiredPartitions] delete expiredPartition %v automatically fail, err(%v)", partitionName, err)
continue
}
log.LogInfof("action[deleteExpiredPartitions] delete expiredPartition %v automatically", partitionName)
} else {
notDeletedExpiredPartitionNames = append(notDeletedExpiredPartitionNames, partitionName)
}
}
return
}
func (d *Disk) AddSize(size uint64) {
atomic.AddUint64(&d.Allocated, size)
}
func (d *Disk) updateDisk(allocSize uint64) {
d.Lock()
defer d.Unlock()
if d.Available < allocSize {
d.Status = proto.ReadOnly
d.Available = 0
return
}
d.Available = d.Available - allocSize
}
func (d *Disk) getSelectWeight() float64 {
return float64(atomic.LoadUint64(&d.Allocated)) / float64(d.Total)
}
func (d *Disk) AddDiskErrPartition(dpId uint64) {
if _, ok := d.DiskErrPartitionSet[dpId]; !ok {
d.DiskErrPartitionSet[dpId] = struct{}{}
}
}
func (d *Disk) GetDiskErrPartitionList() (diskErrPartitionList []uint64) {
diskErrPartitionList = make([]uint64, 0)
for k := range d.DiskErrPartitionSet {
diskErrPartitionList = append(diskErrPartitionList, k)
}
return diskErrPartitionList
}
func (d *Disk) GetDiskErrPartitionCount() uint64 {
return uint64(len(d.DiskErrPartitionSet))
}
// isExpiredPartition returns whether a partition is expired.
// If a partition is not known to the master, we consider it expired.
func isExpiredPartition(id uint64, partitions []uint64) bool {
if len(partitions) == 0 {
return true
}
for _, existId := range partitions {
if existId == id {
return false
}
}
return true
}
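// Illustrative calls (values are made up for the example):
//
//	isExpiredPartition(7, []uint64{1, 2, 3}) // true: 7 is not known to the master
//	isExpiredPartition(2, []uint64{1, 2, 3}) // false
//	isExpiredPartition(2, nil)               // true: an empty list marks every partition as expired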
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package datanode
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
type NewDiskParam struct {
Path string
ReservedSpace uint64
DiskRdonlySpace uint64
MaxErrCnt int
Space *SpaceManager
}
func FuzzNewDisk(data []byte) int {
f := fuzz.NewConsumer(data)
param := NewDiskParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
disk, err := NewDisk(param.Path, param.ReservedSpace, param.DiskRdonlySpace, param.MaxErrCnt, param.Space)
if disk == nil {
return 0
}
if err != nil {
return 0
}
return 1
}
package datanode
import (
"context"
"fmt"
"golang.org/x/time/rate"
)
var (
deleteLimiteRater = rate.NewLimiter(rate.Inf, defaultMarkDeleteLimitBurst)
MaxExtentRepairLimit = 20000
MinExtentRepairLimit = 5
CurExtentRepairLimit = MaxExtentRepairLimit
extentRepairLimitRater chan struct{}
)
func initRepairLimit() {
extentRepairLimitRater = make(chan struct{}, MaxExtentRepairLimit)
for i := 0; i < MaxExtentRepairLimit; i++ {
extentRepairLimitRater <- struct{}{}
}
}
func requestDoExtentRepair() (err error) {
err = fmt.Errorf("repair limit, cannot do extentRepair")
select {
case <-extentRepairLimitRater:
return nil
default:
return
}
}
func fininshDoExtentRepair() {
select {
case extentRepairLimitRater <- struct{}{}:
return
default:
return
}
}
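// Illustrative usage of the extent-repair concurrency limiter above (the caller
// shown here is hypothetical):
//
//	if err := requestDoExtentRepair(); err != nil {
//		return err // limit reached, skip this repair for now
//	}
//	defer fininshDoExtentRepair()
//	// ... perform the actual extent repair ...
//
// Each successful requestDoExtentRepair consumes one token from
// extentRepairLimitRater and fininshDoExtentRepair puts it back.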
func setDoExtentRepair(value int) {
if value <= 0 {
value = MaxExtentRepairLimit
}
if value > MaxExtentRepairLimit {
value = MaxExtentRepairLimit
}
if value < MinExtentRepairLimit {
value = MinExtentRepairLimit
}
if CurExtentRepairLimit != value {
CurExtentRepairLimit = value
close(extentRepairLimitRater)
extentRepairLimitRater = make(chan struct{}, CurExtentRepairLimit)
for i := 0; i < CurExtentRepairLimit; i++ {
extentRepairLimitRater <- struct{}{}
}
}
}
func DeleteLimiterWait() {
ctx := context.Background()
deleteLimiteRater.Wait(ctx)
}
func setLimiter(limiter *rate.Limiter, limitValue uint64) {
r := limitValue
l := rate.Limit(r)
if r == 0 {
l = rate.Inf
}
limiter.SetLimit(l)
}
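// exampleRepairAndDeleteLimits is an illustrative sketch and not part of the
// original flow: it shows the intended use of the extent-repair token bucket
// (acquire a token before a repair, release it when done) and how setLimiter
// treats a limit of zero as unlimited. The limit values are hypothetical.
func exampleRepairAndDeleteLimits() {
	initRepairLimit()
	if err := requestDoExtentRepair(); err == nil {
		// ... perform one extent repair ...
		fininshDoExtentRepair()
	}
	// A value of 0 maps to rate.Inf, so DeleteLimiterWait callers are not throttled.
	setLimiter(deleteLimiteRater, 0)
	// A positive value throttles mark-delete operations to that many per second.
	setLimiter(deleteLimiteRater, 128)
}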
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"context"
"math"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
const minusOne = ^uint32(0)
type ioLimiter struct {
limit int
flow *rate.Limiter
io atomic.Value
}
type LimiterStatus struct {
FlowLimit int
FlowUsed int
IOConcurrency int
IOQueue int
IORunning int
IOWaiting int
}
// The flow rate limiter's burst is double the limit.
// The max io queue size is 8 times the io concurrency.
func newIOLimiter(flowLimit, ioConcurrency int) *ioLimiter {
flow := rate.NewLimiter(rate.Inf, 0)
if flowLimit > 0 {
flow = rate.NewLimiter(rate.Limit(flowLimit), 2*flowLimit)
}
l := &ioLimiter{limit: flowLimit, flow: flow}
l.io.Store(newIOQueue(ioConcurrency))
return l
}
func (l *ioLimiter) getIO() *ioQueue {
return l.io.Load().(*ioQueue)
}
func (l *ioLimiter) ResetFlow(flowLimit int) {
l.limit = flowLimit
if flowLimit <= 0 {
l.flow.SetLimit(rate.Inf)
l.flow.SetBurst(0)
} else {
l.flow.SetLimit(rate.Limit(flowLimit))
l.flow.SetBurst(2 * flowLimit)
}
}
func (l *ioLimiter) ResetIO(ioConcurrency int) {
q := l.io.Swap(newIOQueue(ioConcurrency)).(*ioQueue)
q.Close()
}
func (l *ioLimiter) Run(size int, taskFn func()) {
if size > 0 {
if err := l.flow.WaitN(context.Background(), size); err != nil {
log.LogWarnf("action[limitio] run wait flow with %d %s", size, err.Error())
}
}
l.getIO().Run(taskFn)
}
func (l *ioLimiter) TryRun(size int, taskFn func()) bool {
if ok := l.getIO().TryRun(taskFn); !ok {
return false
}
if size > 0 {
if err := l.flow.WaitN(context.Background(), size); err != nil {
log.LogWarnf("action[limitio] tryrun wait flow with %d %s", size, err.Error())
return false
}
}
return true
}
func (l *ioLimiter) Status() (st LimiterStatus) {
st = l.getIO().Status()
limit := l.limit
st.FlowLimit = limit
if limit > 0 {
now := time.Now()
reserve := l.flow.ReserveN(now, l.flow.Burst())
duration := reserve.DelayFrom(now)
reserve.Cancel()
if ms := duration.Microseconds(); ms > 0 {
st.FlowUsed = int(math.Ceil(float64(limit) * (float64(ms) / 1e6)))
}
}
return
}
func (l *ioLimiter) Close() {
q := l.io.Swap(newIOQueue(0)).(*ioQueue)
q.Close()
}
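// exampleIOLimiterUsage is an illustrative sketch and not part of the original
// code path: it shows how a caller is expected to drive ioLimiter. With
// flowLimit=64 the burst is 2*64, and with ioConcurrency=4 the internal queue
// holds up to 8*4 waiting tasks. The numbers are hypothetical.
func exampleIOLimiterUsage() LimiterStatus {
	l := newIOLimiter(64, 4)
	defer l.Close()
	// Run blocks: it waits for flow tokens, then queues the task and waits for it to finish.
	l.Run(16, func() {
		// ... perform a 16-unit write ...
	})
	// TryRun refuses instead of blocking when the io queue is full.
	if ok := l.TryRun(16, func() {}); !ok {
		// The caller may fall back to returning a busy error here.
	}
	// Limits can be adjusted at runtime without recreating the limiter.
	l.ResetFlow(128)
	l.ResetIO(8)
	return l.Status()
}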
type task struct {
fn func()
done chan struct{}
}
type ioQueue struct {
wg sync.WaitGroup
once sync.Once
running uint32
concurrency int
stopCh chan struct{}
queue chan *task
}
func newIOQueue(concurrency int) *ioQueue {
q := &ioQueue{concurrency: concurrency}
if q.concurrency <= 0 {
return q
}
q.stopCh = make(chan struct{})
q.queue = make(chan *task, 8*concurrency)
q.wg.Add(concurrency)
for ii := 0; ii < concurrency; ii++ {
go func() {
defer q.wg.Done()
for {
select {
case <-q.stopCh:
return
case task := <-q.queue:
atomic.AddUint32(&q.running, 1)
task.fn()
atomic.AddUint32(&q.running, minusOne)
close(task.done)
}
}
}()
}
return q
}
func (q *ioQueue) Run(taskFn func()) {
if q.concurrency <= 0 {
taskFn()
return
}
select {
case <-q.stopCh:
taskFn()
return
default:
}
task := &task{fn: taskFn, done: make(chan struct{})}
select {
case <-q.stopCh:
taskFn()
case q.queue <- task:
<-task.done
}
}
func (q *ioQueue) TryRun(taskFn func()) bool {
if q.concurrency <= 0 {
taskFn()
return true
}
select {
case <-q.stopCh:
taskFn()
return true
default:
}
task := &task{fn: taskFn, done: make(chan struct{})}
select {
case <-q.stopCh:
taskFn()
return true
case q.queue <- task:
<-task.done
return true
default:
return false
}
}
func (q *ioQueue) Status() (st LimiterStatus) {
st.IOConcurrency = q.concurrency
st.IOQueue = cap(q.queue)
st.IORunning = int(atomic.LoadUint32(&q.running))
st.IOWaiting = len(q.queue)
return
}
func (q *ioQueue) Close() {
q.once.Do(func() {
if q.concurrency > 0 {
close(q.stopCh)
}
})
q.wg.Wait()
	// Drain any remaining queued tasks after close so their callers are not
	// left blocked; exit after one minute without new tasks.
go func() {
waitTimer := time.NewTimer(time.Minute)
defer waitTimer.Stop()
for {
select {
case task := <-q.queue:
task.fn()
close(task.done)
waitTimer.Reset(time.Minute)
case <-waitTimer.C:
return
}
}
}()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"fmt"
"time"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
const (
StatPeriod = time.Minute * time.Duration(1)
MetricPartitionIOName = "dataPartitionIO"
MetricPartitionIOBytesName = "dataPartitionIOBytes"
MetricLackDpCount = "lackDataPartitionCount"
MetricCapacityToCreateDp = "capacityToCreateDp"
MetricConnectionCnt = "connectionCnt"
MetricDpCount = "dataPartitionCount"
MetricTotalDpSize = "totalDpSize"
MetricCapacity = "capacity"
)
type DataNodeMetrics struct {
dataNode *DataNode
stopC chan struct{}
MetricIOBytes *exporter.Counter
MetricLackDpCount *exporter.GaugeVec
MetricCapacityToCreateDp *exporter.GaugeVec
MetricConnectionCnt *exporter.Gauge
MetricDpCount *exporter.Gauge
MetricTotalDpSize *exporter.Gauge
MetricCapacity *exporter.GaugeVec
}
func (d *DataNode) registerMetrics() {
d.metrics = &DataNodeMetrics{
dataNode: d,
stopC: make(chan struct{}),
}
d.metrics.MetricIOBytes = exporter.NewCounter(MetricPartitionIOBytesName)
d.metrics.MetricLackDpCount = exporter.NewGaugeVec(MetricLackDpCount, "", []string{"type"})
d.metrics.MetricCapacityToCreateDp = exporter.NewGaugeVec(MetricCapacityToCreateDp, "", []string{"type"})
d.metrics.MetricConnectionCnt = exporter.NewGauge(MetricConnectionCnt)
d.metrics.MetricDpCount = exporter.NewGauge(MetricDpCount)
d.metrics.MetricTotalDpSize = exporter.NewGauge(MetricTotalDpSize)
d.metrics.MetricCapacity = exporter.NewGaugeVec(MetricCapacity, "", []string{"type"})
}
func (d *DataNode) startMetrics() {
go d.metrics.statMetrics()
log.LogInfof("startMetrics")
}
func (d *DataNode) closeMetrics() {
close(d.metrics.stopC)
log.LogInfof("closeMetrics")
}
func GetIoMetricLabels(partition *DataPartition, tp string) map[string]string {
labels := make(map[string]string)
labels[exporter.Vol] = partition.volumeID
labels[exporter.Type] = tp
labels[exporter.Disk] = partition.disk.Path
if exporter.EnablePid {
labels[exporter.PartId] = fmt.Sprintf("%d", partition.partitionID)
}
return labels
}
func (dm *DataNodeMetrics) statMetrics() {
ticker := time.NewTicker(StatPeriod)
for {
select {
case <-dm.stopC:
ticker.Stop()
log.LogInfof("stop metrics ticker")
return
case <-ticker.C:
dm.doStat()
}
}
}
func (dm *DataNodeMetrics) doStat() {
dm.setLackDpCountMetrics()
dm.setCapacityToCreateDpMetrics()
dm.setConnectionCntMetrics()
dm.setDpCountMetrics()
dm.setTotalDpSizeMetrics()
dm.setCapacityMetrics()
}
func (dm *DataNodeMetrics) setLackDpCountMetrics() {
lackPartitionsInMem := dm.dataNode.space.stats.LackPartitionsInMem
lackPartitionsInDisk := dm.dataNode.space.stats.LackPartitionsInDisk
dm.MetricLackDpCount.SetWithLabelValues(float64(lackPartitionsInMem), "inMemory")
dm.MetricLackDpCount.SetWithLabelValues(float64(lackPartitionsInDisk), "inDisk")
}
func (dm *DataNodeMetrics) setCapacityToCreateDpMetrics() {
remainingCapacityToCreateDp := dm.dataNode.space.stats.RemainingCapacityToCreatePartition
maxCapacityToCreateDp := dm.dataNode.space.stats.MaxCapacityToCreatePartition
dm.MetricCapacityToCreateDp.SetWithLabelValues(float64(remainingCapacityToCreateDp), "remaining")
dm.MetricCapacityToCreateDp.SetWithLabelValues(float64(maxCapacityToCreateDp), "max")
}
func (dm *DataNodeMetrics) setConnectionCntMetrics() {
connectionCnt := dm.dataNode.space.stats.ConnectionCnt
dm.MetricConnectionCnt.Set(float64(connectionCnt))
}
func (dm *DataNodeMetrics) setDpCountMetrics() {
dpCount := dm.dataNode.space.stats.CreatedPartitionCnt
dm.MetricDpCount.Set(float64(dpCount))
}
func (dm *DataNodeMetrics) setTotalDpSizeMetrics() {
totalDpSize := dm.dataNode.space.stats.TotalPartitionSize
dm.MetricTotalDpSize.Set(float64(totalDpSize))
}
func (dm *DataNodeMetrics) setCapacityMetrics() {
total := dm.dataNode.space.stats.Total
used := dm.dataNode.space.stats.Used
available := dm.dataNode.space.stats.Available
dm.MetricCapacity.SetWithLabelValues(float64(total), "total")
dm.MetricCapacity.SetWithLabelValues(float64(used), "used")
dm.MetricCapacity.SetWithLabelValues(float64(available), "available")
}
package datanode
import (
"sync/atomic"
"time"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
const (
defaultMarkDeleteLimitRate = rate.Inf
defaultMarkDeleteLimitBurst = 512
defaultIOLimitBurst = 512
UpdateNodeInfoTicket = 1 * time.Minute
RepairTimeOut = time.Hour * 24
MaxRepairErrCnt = 1000
)
var nodeInfoStopC = make(chan struct{})
func (m *DataNode) startUpdateNodeInfo() {
ticker := time.NewTicker(UpdateNodeInfoTicket)
defer ticker.Stop()
for {
select {
case <-nodeInfoStopC:
log.LogInfo("datanode nodeinfo goroutine stopped")
return
case <-ticker.C:
m.updateNodeInfo()
}
}
}
func (m *DataNode) stopUpdateNodeInfo() {
nodeInfoStopC <- struct{}{}
}
func (m *DataNode) updateNodeInfo() {
clusterInfo, err := MasterClient.AdminAPI().GetClusterInfo()
if err != nil {
log.LogErrorf("[updateDataNodeInfo] %s", err.Error())
return
}
setLimiter(deleteLimiteRater, clusterInfo.DataNodeDeleteLimitRate)
setDoExtentRepair(int(clusterInfo.DataNodeAutoRepairLimitRate))
atomic.StoreUint64(&m.dpMaxRepairErrCnt, clusterInfo.DpMaxRepairErrCnt)
log.LogInfof("updateNodeInfo from master:"+
"deleteLimite(%v), autoRepairLimit(%v), dpMaxRepairErrCnt(%v)",
clusterInfo.DataNodeDeleteLimitRate, clusterInfo.DataNodeAutoRepairLimitRate,
clusterInfo.DpMaxRepairErrCnt)
}
func (m *DataNode) GetDpMaxRepairErrCnt() uint64 {
dpMaxRepairErrCnt := atomic.LoadUint64(&m.dpMaxRepairErrCnt)
if dpMaxRepairErrCnt == 0 {
return MaxRepairErrCnt
}
return dpMaxRepairErrCnt
}
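// exampleNodeLimitsDefaults is an illustrative sketch and not part of the
// original flow: it shows the defaulting behaviour applied to values pushed by
// the master. setDoExtentRepair clamps the auto-repair limit into
// [MinExtentRepairLimit, MaxExtentRepairLimit], with non-positive values
// meaning "use the max", and GetDpMaxRepairErrCnt falls back to
// MaxRepairErrCnt when the cluster value is 0. The values are hypothetical.
func exampleNodeLimitsDefaults(m *DataNode) uint64 {
	// initRepairLimit must run before setDoExtentRepair resizes the token
	// channel; it is called here only to keep the sketch self-contained.
	initRepairLimit()
	setDoExtentRepair(0)     // non-positive -> MaxExtentRepairLimit
	setDoExtentRepair(50000) // above the max -> clamped to MaxExtentRepairLimit
	setDoExtentRepair(1)     // below the min -> clamped to MinExtentRepairLimit
	return m.GetDpMaxRepairErrCnt() // 0 from the master means MaxRepairErrCnt
}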
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/json"
"fmt"
"hash/crc32"
"math"
"net"
"os"
"path"
"sort"
"strings"
"sync"
"sync/atomic"
"time"
raftProto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
DataPartitionPrefix = "datapartition"
CachePartitionPrefix = "cachepartition"
PreLoadPartitionPrefix = "preloadpartition"
DataPartitionMetadataFileName = "META"
TempMetadataFileName = ".meta"
ApplyIndexFile = "APPLY"
TempApplyIndexFile = ".apply"
TimeLayout = "2006-01-02 15:04:05"
)
const (
RaftStatusStopped = 0
RaftStatusRunning = 1
)
type DataPartitionMetadata struct {
VolumeID string
PartitionID uint64
PartitionSize int
PartitionType int
CreateTime string
Peers []proto.Peer
Hosts []string
DataPartitionCreateType int
LastTruncateID uint64
ReplicaNum int
StopRecover bool
VerList []*proto.VolVersionInfo
ApplyID uint64
}
func (md *DataPartitionMetadata) Validate() (err error) {
md.VolumeID = strings.TrimSpace(md.VolumeID)
if len(md.VolumeID) == 0 || md.PartitionID == 0 || md.PartitionSize == 0 {
err = errors.New("illegal data partition metadata")
return
}
return
}
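// exampleLoadMetadataFromJSON is an illustrative sketch and not part of the
// original loading path (see LoadDataPartition below): it shows the shape of a
// META document that passes Validate. All field values are hypothetical.
func exampleLoadMetadataFromJSON() (*DataPartitionMetadata, error) {
	raw := []byte(`{
		"VolumeID": "vol-example",
		"PartitionID": 1001,
		"PartitionSize": 134217728,
		"PartitionType": 0,
		"ReplicaNum": 3,
		"Hosts": ["192.168.0.1:17310", "192.168.0.2:17310", "192.168.0.3:17310"]
	}`)
	md := &DataPartitionMetadata{}
	if err := json.Unmarshal(raw, md); err != nil {
		return nil, err
	}
	// Validate rejects metadata with an empty VolumeID or a zero PartitionID/PartitionSize.
	if err := md.Validate(); err != nil {
		return nil, err
	}
	return md, nil
}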
// MetaMultiSnapshotInfo
type MetaMultiSnapshotInfo struct {
VerSeq uint64
Status int8
Ctime time.Time
}
type DataPartition struct {
clusterID string
volumeID string
partitionID uint64
partitionStatus int
partitionSize int
partitionType int
replicaNum int
replicas []string // addresses of the replicas
replicasLock sync.RWMutex
disk *Disk
dataNode *DataNode
isLeader bool
isRaftLeader bool
path string
used int
leaderSize int
extentStore *storage.ExtentStore
raftPartition raftstore.Partition
config *dataPartitionCfg
appliedID uint64 // apply id used in Raft
lastTruncateID uint64 // truncate id used in Raft
metaAppliedID uint64 // apply id while do meta persist
minAppliedID uint64
maxAppliedID uint64
stopOnce sync.Once
stopRaftC chan uint64
storeC chan uint64
stopC chan bool
raftStatus int32
intervalToUpdateReplicas int64 // interval to ask the master for updating the replica information
snapshot []*proto.File
snapshotMutex sync.RWMutex
intervalToUpdatePartitionSize int64
loadExtentHeaderStatus int
DataPartitionCreateType int
isLoadingDataPartition int32
persistMetaMutex sync.RWMutex
// snapshot
verSeq uint64
verSeqPrepare uint64
verSeqCommitStatus int8
volVersionInfoList *proto.VolVersionInfoList
decommissionRepairProgress float64 // record repair progress for decommission datapartition
stopRecover bool
	recoverErrCnt uint64 // do not reset; if it reaches the max err cnt, delete this dp
diskErrCnt uint64 // number of disk io errors while reading or writing
}
func (dp *DataPartition) IsForbidden() bool {
return dp.config.Forbidden
}
func (dp *DataPartition) SetForbidden(status bool) {
dp.config.Forbidden = status
}
func CreateDataPartition(dpCfg *dataPartitionCfg, disk *Disk, request *proto.CreateDataPartitionRequest) (dp *DataPartition, err error) {
if dp, err = newDataPartition(dpCfg, disk, true); err != nil {
return
}
dp.ForceLoadHeader()
if request.CreateType == proto.NormalCreateDataPartition {
err = dp.StartRaft(false)
} else {
		// reserve the space reported by the leader on this disk
disk.updateDisk(uint64(request.LeaderSize))
// ensure heartbeat report Recovering
dp.partitionStatus = proto.Recovering
go dp.StartRaftAfterRepair(false)
}
if err != nil {
return nil, err
}
// persist file metadata
go dp.StartRaftLoggingSchedule()
dp.DataPartitionCreateType = request.CreateType
dp.replicaNum = request.ReplicaNum
err = dp.PersistMetadata()
disk.AddSize(uint64(dp.Size()))
return
}
func (dp *DataPartition) IsEquareCreateDataPartitionRequst(request *proto.CreateDataPartitionRequest) (err error) {
if len(dp.config.Peers) != len(request.Members) {
return fmt.Errorf("exist partition(%v) peers len(%v) members len(%v)",
dp.partitionID, len(dp.config.Peers), len(request.Members))
}
for index, host := range dp.config.Hosts {
requestHost := request.Hosts[index]
if host != requestHost {
return fmt.Errorf("exist partition(%v) index(%v) requestHost(%v) persistHost(%v)",
dp.partitionID, index, requestHost, host)
}
}
for index, peer := range dp.config.Peers {
requestPeer := request.Members[index]
if requestPeer.ID != peer.ID || requestPeer.Addr != peer.Addr {
return fmt.Errorf("exist partition(%v) index(%v) requestPeer(%v) persistPeers(%v)",
dp.partitionID, index, requestPeer, peer)
}
}
if dp.config.VolName != request.VolumeId {
return fmt.Errorf("exist partition Partition(%v) requestVolName(%v) persistVolName(%v)",
dp.partitionID, request.VolumeId, dp.config.VolName)
}
return
}
func (dp *DataPartition) ForceSetDataPartitionToLoadding() {
atomic.StoreInt32(&dp.isLoadingDataPartition, 1)
}
func (dp *DataPartition) ForceSetDataPartitionToFininshLoad() {
atomic.StoreInt32(&dp.isLoadingDataPartition, 0)
}
func (dp *DataPartition) IsDataPartitionLoading() bool {
return atomic.LoadInt32(&dp.isLoadingDataPartition) == 1
}
func (dp *DataPartition) ForceSetRaftRunning() {
atomic.StoreInt32(&dp.raftStatus, RaftStatusRunning)
}
// LoadDataPartition loads and returns a partition instance based on the specified directory.
// It reads the partition metadata file stored under the specified directory
// and creates the partition instance.
func LoadDataPartition(partitionDir string, disk *Disk) (dp *DataPartition, err error) {
var metaFileData []byte
if metaFileData, err = os.ReadFile(path.Join(partitionDir, DataPartitionMetadataFileName)); err != nil {
return
}
meta := &DataPartitionMetadata{}
if err = json.Unmarshal(metaFileData, meta); err != nil {
return
}
if err = meta.Validate(); err != nil {
return
}
dpCfg := &dataPartitionCfg{
VolName: meta.VolumeID,
PartitionSize: meta.PartitionSize,
PartitionType: meta.PartitionType,
PartitionID: meta.PartitionID,
ReplicaNum: meta.ReplicaNum,
Peers: meta.Peers,
Hosts: meta.Hosts,
RaftStore: disk.space.GetRaftStore(),
NodeID: disk.space.GetNodeID(),
ClusterID: disk.space.GetClusterID(),
}
if dp, err = newDataPartition(dpCfg, disk, false); err != nil {
return
}
dp.stopRecover = meta.StopRecover
dp.metaAppliedID = meta.ApplyID
dp.computeUsage()
dp.ForceSetDataPartitionToLoadding()
disk.space.AttachPartition(dp)
if err = dp.LoadAppliedID(); err != nil {
log.LogErrorf("action[loadApplyIndex] %v", err)
return
}
log.LogInfof("Action(LoadDataPartition) PartitionID(%v) meta(%v) stopRecover(%v)", dp.partitionID, meta, meta.StopRecover)
dp.DataPartitionCreateType = meta.DataPartitionCreateType
dp.lastTruncateID = meta.LastTruncateID
if meta.DataPartitionCreateType == proto.NormalCreateDataPartition {
err = dp.StartRaft(true)
} else {
// init leaderSize to partitionSize
dp.leaderSize = dp.partitionSize
dp.partitionStatus = proto.Recovering
go dp.StartRaftAfterRepair(true)
}
if err != nil {
log.LogErrorf("PartitionID(%v) start raft err(%v)..", dp.partitionID, err)
disk.space.DetachDataPartition(dp.partitionID)
return
}
go dp.StartRaftLoggingSchedule()
disk.AddSize(uint64(dp.Size()))
dp.ForceLoadHeader()
return
}
func newDataPartition(dpCfg *dataPartitionCfg, disk *Disk, isCreate bool) (dp *DataPartition, err error) {
partitionID := dpCfg.PartitionID
var dataPath string
if proto.IsNormalDp(dpCfg.PartitionType) {
dataPath = path.Join(disk.Path, fmt.Sprintf(DataPartitionPrefix+"_%v_%v", partitionID, dpCfg.PartitionSize))
} else if proto.IsCacheDp(dpCfg.PartitionType) {
dataPath = path.Join(disk.Path, fmt.Sprintf(CachePartitionPrefix+"_%v_%v", partitionID, dpCfg.PartitionSize))
} else if proto.IsPreLoadDp(dpCfg.PartitionType) {
dataPath = path.Join(disk.Path, fmt.Sprintf(PreLoadPartitionPrefix+"_%v_%v", partitionID, dpCfg.PartitionSize))
} else {
return nil, fmt.Errorf("newDataPartition fail, dataPartitionCfg(%v)", dpCfg)
}
partition := &DataPartition{
volumeID: dpCfg.VolName,
clusterID: dpCfg.ClusterID,
partitionID: partitionID,
replicaNum: dpCfg.ReplicaNum,
disk: disk,
dataNode: disk.dataNode,
path: dataPath,
partitionSize: dpCfg.PartitionSize,
partitionType: dpCfg.PartitionType,
replicas: make([]string, 0),
stopC: make(chan bool),
stopRaftC: make(chan uint64),
storeC: make(chan uint64, 128),
snapshot: make([]*proto.File, 0),
partitionStatus: proto.ReadWrite,
config: dpCfg,
raftStatus: RaftStatusStopped,
verSeq: dpCfg.VerSeq,
DataPartitionCreateType: dpCfg.CreateType,
volVersionInfoList: &proto.VolVersionInfoList{},
}
atomic.StoreUint64(&partition.recoverErrCnt, 0)
log.LogInfof("action[newDataPartition] dp %v replica num %v", partitionID, dpCfg.ReplicaNum)
partition.replicasInit()
partition.extentStore, err = storage.NewExtentStore(partition.path, dpCfg.PartitionID, dpCfg.PartitionSize,
partition.partitionType, isCreate)
if err != nil {
log.LogWarnf("action[newDataPartition] dp %v NewExtentStore failed %v", partitionID, err.Error())
return
}
// store applyid
if err = partition.storeAppliedID(partition.appliedID); err != nil {
log.LogErrorf("action[newDataPartition] dp %v initial Apply [%v] failed: %v",
partition.partitionID, partition.appliedID, err)
return
}
disk.AttachDataPartition(partition)
dp = partition
go partition.statusUpdateScheduler()
go partition.startEvict()
if isCreate {
if err = dp.getVerListFromMaster(); err != nil {
log.LogErrorf("action[newDataPartition] vol %v dp %v loadFromMaster verList failed err %v", dp.volumeID, dp.partitionID, err)
return
}
}
log.LogInfof("action[newDataPartition] dp %v replica num %v CreateType %v create success",
dp.partitionID, dpCfg.ReplicaNum, dp.DataPartitionCreateType)
return
}
func (partition *DataPartition) HandleVersionOp(req *proto.MultiVersionOpRequest) (err error) {
var (
verData []byte
pItem *RaftCmdItem
)
if verData, err = json.Marshal(req); err != nil {
return
}
pItem = &RaftCmdItem{
Op: uint32(proto.OpVersionOp),
K: []byte("version"),
V: verData,
}
data, _ := MarshalRaftCmd(pItem)
_, err = partition.Submit(data)
return
}
func (partition *DataPartition) fsmVersionOp(opItem *RaftCmdItem) (err error) {
req := new(proto.MultiVersionOpRequest)
if err = json.Unmarshal(opItem.V, req); err != nil {
log.LogErrorf("action[fsmVersionOp] dp[%v] op item %v", partition.partitionID, opItem)
return
}
if len(req.VolVerList) == 0 {
return
}
lastSeq := req.VolVerList[len(req.VolVerList)-1].Ver
partition.volVersionInfoList.RWLock.Lock()
if len(partition.volVersionInfoList.VerList) == 0 {
partition.volVersionInfoList.VerList = make([]*proto.VolVersionInfo, len(req.VolVerList))
copy(partition.volVersionInfoList.VerList, req.VolVerList)
partition.verSeq = lastSeq
		log.LogInfof("action[fsmVersionOp] dp %v seq %v updateVerList request ver %v verlist %v dp verlist nil and set",
partition.partitionID, partition.verSeq, lastSeq, req.VolVerList)
partition.volVersionInfoList.RWLock.Unlock()
return
}
lastVerInfo := partition.volVersionInfoList.GetLastVolVerInfo()
log.LogInfof("action[fsmVersionOp] dp %v seq %v lastVerList seq %v req seq %v op %v",
partition.partitionID, partition.verSeq, lastVerInfo.Ver, lastSeq, req.Op)
if lastVerInfo.Ver >= lastSeq {
if lastVerInfo.Ver == lastSeq {
if req.Op == proto.CreateVersionCommit {
lastVerInfo.Status = proto.VersionNormal
}
}
partition.volVersionInfoList.RWLock.Unlock()
return
}
var status uint8 = proto.VersionPrepare
if req.Op == proto.CreateVersionCommit {
status = proto.VersionNormal
}
partition.volVersionInfoList.VerList = append(partition.volVersionInfoList.VerList, &proto.VolVersionInfo{
Status: status,
Ver: lastSeq,
})
partition.verSeq = lastSeq
err = partition.PersistMetadata()
	log.LogInfof("action[fsmVersionOp] dp %v seq %v updateVerList request add new seq %v verlist (%v) err (%v)",
partition.partitionID, partition.verSeq, lastSeq, partition.volVersionInfoList, err)
partition.volVersionInfoList.RWLock.Unlock()
return
}
func (dp *DataPartition) getVerListFromMaster() (err error) {
var verList *proto.VolVersionInfoList
verList, err = MasterClient.AdminAPI().GetVerList(dp.volumeID)
if err != nil {
log.LogErrorf("action[onStart] GetVerList err[%v]", err)
return
}
for _, info := range verList.VerList {
if info.Status != proto.VersionNormal {
continue
}
dp.volVersionInfoList.VerList = append(dp.volVersionInfoList.VerList, info)
}
log.LogDebugf("action[onStart] dp %v verList %v", dp.partitionID, dp.volVersionInfoList.VerList)
dp.verSeq = dp.volVersionInfoList.GetLastVer()
return
}
func (dp *DataPartition) replicasInit() {
replicas := make([]string, 0)
if dp.config.Hosts == nil {
return
}
replicas = append(replicas, dp.config.Hosts...)
dp.replicasLock.Lock()
dp.replicas = replicas
dp.replicasLock.Unlock()
if dp.config.Hosts != nil && len(dp.config.Hosts) >= 1 {
leaderAddr := strings.Split(dp.config.Hosts[0], ":")
if len(leaderAddr) == 2 && strings.TrimSpace(leaderAddr[0]) == LocalIP {
dp.isLeader = true
}
}
}
func (dp *DataPartition) GetExtentCount() int {
return dp.extentStore.GetExtentCount()
}
func (dp *DataPartition) Path() string {
return dp.path
}
// IsRaftLeader tells if the given address belongs to the raft leader.
func (dp *DataPartition) IsRaftLeader() (addr string, ok bool) {
if dp.raftStopped() {
return
}
leaderID, _ := dp.raftPartition.LeaderTerm()
if leaderID == 0 {
return
}
ok = leaderID == dp.config.NodeID
for _, peer := range dp.config.Peers {
if leaderID == peer.ID {
addr = peer.Addr
return
}
}
return
}
func (dp *DataPartition) Replicas() []string {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
return dp.replicas
}
func (dp *DataPartition) getReplicaCopy() []string {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
tmpCopy := make([]string, len(dp.replicas))
copy(tmpCopy, dp.replicas)
return tmpCopy
}
func (dp *DataPartition) getReplicaAddr(index int) string {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
return dp.replicas[index]
}
func (dp *DataPartition) getReplicaLen() int {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
return len(dp.replicas)
}
func (dp *DataPartition) IsExistReplica(addr string) bool {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
for _, host := range dp.replicas {
if host == addr {
return true
}
}
return false
}
func (dp *DataPartition) ReloadSnapshot() {
files, err := dp.extentStore.SnapShot()
if err != nil {
log.LogErrorf("ReloadSnapshot err %v", err)
return
}
dp.snapshotMutex.Lock()
for _, f := range dp.snapshot {
storage.PutSnapShotFileToPool(f)
}
dp.snapshot = files
dp.snapshotMutex.Unlock()
}
// SnapShot returns the snapshot of the data partition.
func (dp *DataPartition) SnapShot() (files []*proto.File) {
dp.snapshotMutex.RLock()
defer dp.snapshotMutex.RUnlock()
return dp.snapshot
}
// Stop closes the store and the raft store.
func (dp *DataPartition) Stop() {
dp.stopOnce.Do(func() {
if dp.stopC != nil {
close(dp.stopC)
}
// Close the store and raftstore.
dp.stopRaft()
dp.extentStore.Close()
err := dp.storeAppliedID(atomic.LoadUint64(&dp.appliedID))
if err != nil {
log.LogErrorf("action[Stop]: failed to store applied index")
}
})
}
// Disk returns the disk instance.
func (dp *DataPartition) Disk() *Disk {
return dp.disk
}
// func (dp *DataPartition) IsRejectWrite() bool {
// return dp.Disk().RejectWrite
// }
// Status returns the partition status.
func (dp *DataPartition) Status() int {
return dp.partitionStatus
}
// Size returns the partition size.
func (dp *DataPartition) Size() int {
return dp.partitionSize
}
// Used returns the used space.
func (dp *DataPartition) Used() int {
return dp.used
}
// Available returns the available space.
func (dp *DataPartition) Available() int {
return dp.partitionSize - dp.used
}
func (dp *DataPartition) ForceLoadHeader() {
dp.loadExtentHeaderStatus = FinishLoadDataPartitionExtentHeader
}
// PersistMetadata persists the file metadata on the disk.
func (dp *DataPartition) PersistMetadata() (err error) {
dp.persistMetaMutex.Lock()
defer dp.persistMetaMutex.Unlock()
var (
metadataFile *os.File
metaData []byte
)
fileName := path.Join(dp.Path(), TempMetadataFileName)
if metadataFile, err = os.OpenFile(fileName, os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
defer func() {
metadataFile.Sync()
metadataFile.Close()
os.Remove(fileName)
}()
md := &DataPartitionMetadata{
VolumeID: dp.config.VolName,
PartitionID: dp.config.PartitionID,
ReplicaNum: dp.config.ReplicaNum,
PartitionSize: dp.config.PartitionSize,
PartitionType: dp.config.PartitionType,
Peers: dp.config.Peers,
Hosts: dp.config.Hosts,
DataPartitionCreateType: dp.DataPartitionCreateType,
CreateTime: time.Now().Format(TimeLayout),
LastTruncateID: dp.lastTruncateID,
StopRecover: dp.stopRecover,
VerList: dp.volVersionInfoList.VerList,
ApplyID: dp.appliedID,
}
if metaData, err = json.Marshal(md); err != nil {
return
}
if _, err = metadataFile.Write(metaData); err != nil {
return
}
dp.metaAppliedID = dp.appliedID
log.LogInfof("PersistMetadata DataPartition(%v) data(%v)", dp.partitionID, string(metaData))
err = os.Rename(fileName, path.Join(dp.Path(), DataPartitionMetadataFileName))
return
}
func (dp *DataPartition) statusUpdateScheduler() {
ticker := time.NewTicker(time.Minute)
snapshotTicker := time.NewTicker(time.Minute * 5)
var index int
for {
select {
case <-ticker.C:
dp.statusUpdate()
// only repair tiny extent
if !dp.isNormalType() {
dp.LaunchRepair(proto.TinyExtentType)
continue
}
index++
if index >= math.MaxUint32 {
index = 0
}
if index%2 == 0 {
dp.LaunchRepair(proto.TinyExtentType)
} else {
dp.LaunchRepair(proto.NormalExtentType)
}
case <-snapshotTicker.C:
dp.ReloadSnapshot()
case <-dp.stopC:
ticker.Stop()
snapshotTicker.Stop()
return
}
}
}
func (dp *DataPartition) statusUpdate() {
status := proto.ReadWrite
dp.computeUsage()
if dp.used >= dp.partitionSize {
status = proto.ReadOnly
}
if dp.isNormalType() && dp.extentStore.GetExtentCount() >= storage.MaxExtentCount {
status = proto.ReadOnly
}
if dp.isNormalType() && dp.raftStatus == RaftStatusStopped {
// dp is still recovering
if dp.DataPartitionCreateType == proto.DecommissionedCreateDataPartition {
status = proto.Recovering
} else {
status = proto.Unavailable
}
}
if dp.getDiskErrCnt() > 0 {
dp.partitionStatus = proto.Unavailable
}
log.LogInfof("action[statusUpdate] dp %v raft status %v dp.status %v, status %v, disk status %v",
dp.partitionID, dp.raftStatus, dp.Status(), status, float64(dp.disk.Status))
// dp.partitionStatus = int(math.Min(float64(status), float64(dp.disk.Status)))
dp.partitionStatus = status
}
func (dp *DataPartition) computeUsage() {
if time.Now().Unix()-dp.intervalToUpdatePartitionSize < IntervalToUpdatePartitionSize {
return
}
dp.used = int(dp.ExtentStore().GetStoreUsedSize())
dp.intervalToUpdatePartitionSize = time.Now().Unix()
}
func (dp *DataPartition) ExtentStore() *storage.ExtentStore {
return dp.extentStore
}
func (dp *DataPartition) checkIsDiskError(err error, rwFlag uint8) {
if err == nil {
return
}
log.LogWarnf("checkIsDiskError: disk path %v, error: %v, partition:%v, rwFlag:%v",
dp.Path(), err.Error(), dp.partitionID, rwFlag)
if !IsDiskErr(err.Error()) {
return
}
dp.stopRaft()
dp.incDiskErrCnt()
dp.disk.triggerDiskError(rwFlag, dp.partitionID)
// must after change disk.status
dp.statusUpdate()
return
}
func newRaftApplyError(err error) error {
return errors.NewErrorf("[Custom Error]: unhandled raft apply error, err(%s)", err)
}
func isRaftApplyError(errMsg string) bool {
return strings.Contains(errMsg, "[Custom Error]: unhandled raft apply error")
}
// String returns the string format of the data partition information.
func (dp *DataPartition) String() (m string) {
return fmt.Sprintf(DataPartitionPrefix+"_%v_%v", dp.partitionID, dp.partitionSize)
}
// LaunchRepair launches the repair of extents.
func (dp *DataPartition) LaunchRepair(extentType uint8) {
if dp.partitionStatus == proto.Unavailable {
return
}
if err := dp.updateReplicas(false); err != nil {
log.LogErrorf("action[LaunchRepair] partition(%v) err(%v).", dp.partitionID, err)
return
}
if !dp.isLeader {
return
}
if dp.extentStore.BrokenTinyExtentCnt() == 0 {
dp.extentStore.MoveAllToBrokenTinyExtentC(MinTinyExtentsToRepair)
}
dp.repair(extentType)
}
func (dp *DataPartition) updateReplicas(isForce bool) (err error) {
if !isForce && time.Now().Unix()-dp.intervalToUpdateReplicas <= IntervalToUpdateReplica {
return
}
dp.isLeader = false
isLeader, replicas, err := dp.fetchReplicasFromMaster()
if err != nil {
return
}
dp.replicasLock.Lock()
defer dp.replicasLock.Unlock()
if !dp.compareReplicas(dp.replicas, replicas) {
log.LogInfof("action[updateReplicas] partition(%v) replicas changed from (%v) to (%v).",
dp.partitionID, dp.replicas, replicas)
}
dp.isLeader = isLeader
dp.replicas = replicas
dp.intervalToUpdateReplicas = time.Now().Unix()
	log.LogInfof("ActionUpdateReplicationHosts partition(%v), force(%v)", dp.partitionID, isForce)
return
}
// Compare the fetched replica with the local one.
func (dp *DataPartition) compareReplicas(v1, v2 []string) (equals bool) {
if len(v1) == len(v2) {
for i := 0; i < len(v1); i++ {
if v1[i] != v2[i] {
return false
}
}
return true
}
return false
}
// Fetch the replica information from the master.
func (dp *DataPartition) fetchReplicasFromMaster() (isLeader bool, replicas []string, err error) {
var partition *proto.DataPartitionInfo
retry := 0
for {
if partition, err = MasterClient.AdminAPI().GetDataPartition(dp.volumeID, dp.partitionID); err != nil {
retry++
if retry > 5 {
isLeader = false
return
}
} else {
break
}
time.Sleep(10 * time.Second)
}
replicas = append(replicas, partition.Hosts...)
if partition.Hosts != nil && len(partition.Hosts) >= 1 {
leaderAddr := strings.Split(partition.Hosts[0], ":")
if len(leaderAddr) == 2 && strings.TrimSpace(leaderAddr[0]) == LocalIP {
isLeader = true
}
}
return
}
func (dp *DataPartition) Load() (response *proto.LoadDataPartitionResponse) {
response = &proto.LoadDataPartitionResponse{}
response.PartitionId = uint64(dp.partitionID)
response.PartitionStatus = dp.partitionStatus
response.Used = uint64(dp.Used())
var err error
if dp.loadExtentHeaderStatus != FinishLoadDataPartitionExtentHeader {
response.PartitionSnapshot = make([]*proto.File, 0)
} else {
response.PartitionSnapshot = dp.SnapShot()
}
if err != nil {
response.Status = proto.TaskFailed
response.Result = err.Error()
return
}
return
}
// DoExtentStoreRepair performs the repairs of the extent store.
// 1. when the extent size is smaller than the max size on the record, start to repair the missing part.
// 2. if the extent does not even exist, create the extent first, and then repair.
func (dp *DataPartition) DoExtentStoreRepair(repairTask *DataPartitionRepairTask) {
if dp.stopRecover && dp.isDecommissionRecovering() {
log.LogWarnf("DoExtentStoreRepair %v receive stop signal", dp.partitionID)
return
}
store := dp.extentStore
log.LogDebugf("DoExtentStoreRepair.dp %v len extents %v", dp.partitionID, len(repairTask.ExtentsToBeCreated))
for _, extentInfo := range repairTask.ExtentsToBeCreated {
log.LogDebugf("DoExtentStoreRepair.dp %v len extentInfo %v", dp.partitionID, extentInfo)
if storage.IsTinyExtent(extentInfo.FileID) {
continue
}
if store.HasExtent(uint64(extentInfo.FileID)) {
continue
}
if !AutoRepairStatus {
log.LogWarnf("AutoRepairStatus is False,so cannot Create extent(%v)", extentInfo.String())
continue
}
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
err := store.Create(uint64(extentInfo.FileID))
if err != nil {
continue
}
}
var (
wg *sync.WaitGroup
recoverIndex int
)
wg = new(sync.WaitGroup)
for _, extentInfo := range repairTask.ExtentsToBeRepaired {
if dp.stopRecover && dp.isDecommissionRecovering() {
log.LogWarnf("DoExtentStoreRepair %v receive stop signal", dp.partitionID)
return
}
if !store.HasExtent(uint64(extentInfo.FileID)) {
continue
}
wg.Add(1)
// repair the extents
go dp.doStreamExtentFixRepair(wg, extentInfo)
recoverIndex++
if recoverIndex%NumOfFilesToRecoverInParallel == 0 {
wg.Wait()
}
}
wg.Wait()
dp.doStreamFixTinyDeleteRecord(repairTask)
}
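// exampleBatchedRepairPattern is an illustrative sketch and not part of the
// original repair flow: it isolates the batching pattern used above in
// DoExtentStoreRepair, where at most NumOfFilesToRecoverInParallel repairs run
// concurrently before the loop waits for the whole batch. The items slice and
// repairOne callback are hypothetical.
func exampleBatchedRepairPattern(items []uint64, repairOne func(uint64)) {
	wg := new(sync.WaitGroup)
	for i, item := range items {
		wg.Add(1)
		go func(id uint64) {
			defer wg.Done()
			repairOne(id)
		}(item)
		// Wait for the current batch before launching more goroutines.
		if (i+1)%NumOfFilesToRecoverInParallel == 0 {
			wg.Wait()
		}
	}
	wg.Wait()
}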
func (dp *DataPartition) pushSyncDeleteRecordFromLeaderMesg() bool {
select {
case dp.Disk().syncTinyDeleteRecordFromLeaderOnEveryDisk <- true:
return true
default:
return false
}
}
func (dp *DataPartition) consumeTinyDeleteRecordFromLeaderMesg() {
select {
case <-dp.Disk().syncTinyDeleteRecordFromLeaderOnEveryDisk:
return
default:
return
}
}
func (dp *DataPartition) doStreamFixTinyDeleteRecord(repairTask *DataPartitionRepairTask) {
var (
localTinyDeleteFileSize int64
err error
conn net.Conn
)
if !dp.pushSyncDeleteRecordFromLeaderMesg() {
return
}
defer func() {
dp.consumeTinyDeleteRecordFromLeaderMesg()
}()
if localTinyDeleteFileSize, err = dp.extentStore.LoadTinyDeleteFileOffset(); err != nil {
return
}
log.LogInfof(ActionSyncTinyDeleteRecord+" start PartitionID(%v) localTinyDeleteFileSize(%v) leaderTinyDeleteFileSize(%v) leaderAddr(%v)",
dp.partitionID, localTinyDeleteFileSize, repairTask.LeaderTinyDeleteRecordFileSize, repairTask.LeaderAddr)
if localTinyDeleteFileSize >= repairTask.LeaderTinyDeleteRecordFileSize {
return
}
if repairTask.LeaderTinyDeleteRecordFileSize-localTinyDeleteFileSize < MinTinyExtentDeleteRecordSyncSize {
return
}
defer func() {
log.LogInfof(ActionSyncTinyDeleteRecord+" end PartitionID(%v) localTinyDeleteFileSize(%v) leaderTinyDeleteFileSize(%v) leaderAddr(%v) err(%v)",
dp.partitionID, localTinyDeleteFileSize, repairTask.LeaderTinyDeleteRecordFileSize, repairTask.LeaderAddr, err)
}()
p := repl.NewPacketToReadTinyDeleteRecord(dp.partitionID, localTinyDeleteFileSize)
if conn, err = dp.getRepairConn(repairTask.LeaderAddr); err != nil {
return
}
defer func() {
dp.putRepairConn(conn, err != nil)
}()
if err = p.WriteToConn(conn); err != nil {
return
}
store := dp.extentStore
start := time.Now().Unix()
for localTinyDeleteFileSize < repairTask.LeaderTinyDeleteRecordFileSize {
if dp.stopRecover && dp.isDecommissionRecovering() {
log.LogWarnf("doStreamFixTinyDeleteRecord %v receive stop signal", dp.partitionID)
return
}
if localTinyDeleteFileSize >= repairTask.LeaderTinyDeleteRecordFileSize {
return
}
if err = p.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
return
}
if p.IsErrPacket() {
logContent := fmt.Sprintf("action[doStreamFixTinyDeleteRecord] %v.",
p.LogMessage(p.GetOpMsg(), conn.RemoteAddr().String(), start, fmt.Errorf(string(p.Data[:p.Size]))))
err = fmt.Errorf(logContent)
return
}
if p.CRC != crc32.ChecksumIEEE(p.Data[:p.Size]) {
err = fmt.Errorf("crc not match")
return
}
if p.Size%storage.DeleteTinyRecordSize != 0 {
			err = fmt.Errorf("invalid tiny delete record size")
return
}
var index int
for (index+1)*storage.DeleteTinyRecordSize <= int(p.Size) {
record := p.Data[index*storage.DeleteTinyRecordSize : (index+1)*storage.DeleteTinyRecordSize]
extentID, offset, size := storage.UnMarshalTinyExtent(record)
localTinyDeleteFileSize += storage.DeleteTinyRecordSize
index++
if !storage.IsTinyExtent(extentID) {
continue
}
DeleteLimiterWait()
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
// log.LogInfof("doStreamFixTinyDeleteRecord Delete PartitionID(%v)_Extent(%v)_Offset(%v)_Size(%v)", dp.partitionID, extentID, offset, size)
store.MarkDelete(extentID, int64(offset), int64(size))
}
}
}
// ChangeRaftMember is a wrapper function of changing the raft member.
func (dp *DataPartition) ChangeRaftMember(changeType raftProto.ConfChangeType, peer raftProto.Peer, context []byte) (resp interface{}, err error) {
resp, err = dp.raftPartition.ChangeMember(changeType, peer, context)
return
}
func (dp *DataPartition) canRemoveSelf() (canRemove bool, err error) {
var partition *proto.DataPartitionInfo
retry := 0
for {
if partition, err = MasterClient.AdminAPI().GetDataPartition(dp.volumeID, dp.partitionID); err != nil {
log.LogErrorf("action[canRemoveSelf] err[%v]", err)
retry++
if retry > 60 {
return
}
} else {
break
}
time.Sleep(10 * time.Second)
}
canRemove = false
var existInPeers bool
for _, peer := range partition.Peers {
if dp.config.NodeID == peer.ID {
existInPeers = true
}
}
if !existInPeers {
canRemove = true
return
}
if dp.config.NodeID == partition.OfflinePeerID {
canRemove = true
return
}
return
}
func (dp *DataPartition) getRepairConn(target string) (net.Conn, error) {
return dp.dataNode.getRepairConnFunc(target)
}
func (dp *DataPartition) putRepairConn(conn net.Conn, forceClose bool) {
log.LogDebugf("action[putRepairConn], forceClose: %v", forceClose)
dp.dataNode.putRepairConnFunc(conn, forceClose)
}
func (dp *DataPartition) isNormalType() bool {
return proto.IsNormalDp(dp.partitionType)
}
type SimpleVolView struct {
vv *proto.SimpleVolView
lastUpdateTime time.Time
}
type VolMap struct {
sync.Mutex
volMap map[string]*SimpleVolView
}
var volViews = VolMap{
Mutex: sync.Mutex{},
volMap: make(map[string]*SimpleVolView),
}
func (vo *VolMap) getSimpleVolView(VolumeID string) (vv *proto.SimpleVolView, err error) {
vo.Lock()
if volView, ok := vo.volMap[VolumeID]; ok && time.Since(volView.lastUpdateTime) < 5*time.Minute {
vo.Unlock()
return volView.vv, nil
}
vo.Unlock()
volView := &SimpleVolView{
vv: nil,
lastUpdateTime: time.Time{},
}
if vv, err = MasterClient.AdminAPI().GetVolumeSimpleInfo(VolumeID); err != nil {
log.LogErrorf("action[GetVolumeSimpleInfo] cannot get vol(%v) from master(%v) err(%v).",
VolumeID, MasterClient.Leader(), err)
return nil, err
}
log.LogDebugf("get volume info, vol(%s), vol(%v)", vv.Name, volView)
volView.vv = vv
volView.lastUpdateTime = time.Now()
vo.Lock()
vo.volMap[VolumeID] = volView
vo.Unlock()
return
}
func (dp *DataPartition) doExtentTtl(ttl int) {
if ttl <= 0 {
		log.LogWarnf("[doExtentTtl] invalid ttl(%v), use default 30", ttl)
ttl = 30
}
extents := dp.extentStore.DumpExtents()
for _, ext := range extents {
if storage.IsTinyExtent(ext.FileID) {
continue
}
if time.Now().Unix()-ext.AccessTime > int64(ttl)*util.OneDaySec() {
log.LogDebugf("action[doExtentTtl] ttl delete dp(%v) extent(%v).", dp.partitionID, ext)
dp.extentStore.MarkDelete(ext.FileID, 0, 0)
}
}
}
func (dp *DataPartition) doExtentEvict(vv *proto.SimpleVolView) {
var (
needDieOut bool
freeSpace int
freeExtentCount int
)
needDieOut = false
if vv.CacheHighWater < vv.CacheLowWater || vv.CacheLowWater < 0 || vv.CacheHighWater > 100 {
log.LogErrorf("action[doExtentEvict] invalid policy dp(%v), CacheHighWater(%v) CacheLowWater(%v).",
dp.partitionID, vv.CacheHighWater, vv.CacheLowWater)
return
}
	// if dp usage is larger than the space high water, do die out.
freeSpace = 0
if dp.Used()*100/dp.Size() > vv.CacheHighWater {
needDieOut = true
freeSpace = dp.Used() - dp.Size()*vv.CacheLowWater/100
} else if dp.partitionStatus == proto.ReadOnly {
needDieOut = true
freeSpace = dp.Used() * (vv.CacheHighWater - vv.CacheLowWater) / 100
}
	// if the dp extent count is larger than the upper limit, do die out.
freeExtentCount = 0
extInfos := dp.extentStore.DumpExtents()
maxExtentCount := dp.Size() / util.DefaultTinySizeLimit
if len(extInfos) > maxExtentCount {
needDieOut = true
freeExtentCount = len(extInfos) - vv.CacheLowWater*maxExtentCount/100
}
log.LogDebugf("action[doExtentEvict], vol %v, LRU(%v, %v), dp %v, usage %v, status(%d), extents %v, freeSpace %v, freeExtentCount %v, needDieOut %v",
vv.Name, vv.CacheLowWater, vv.CacheHighWater, dp.partitionID, dp.Used()*100/dp.Size(), dp.partitionStatus, len(extInfos),
freeSpace, freeExtentCount, needDieOut)
if !needDieOut {
return
}
sort.Sort(extInfos)
for _, ext := range extInfos {
if storage.IsTinyExtent(ext.FileID) {
continue
}
freeSpace -= int(ext.Size)
freeExtentCount--
dp.extentStore.MarkDelete(ext.FileID, 0, 0)
log.LogDebugf("action[doExtentEvict] die out. vol %v, dp(%v), extent(%v).", vv.Name, dp.partitionID, *ext)
if freeSpace <= 0 && freeExtentCount <= 0 {
log.LogDebugf("[doExtentEvict] die out done, vol(%s), dp (%d)", vv.Name, dp.partitionID)
break
}
}
}
func (dp *DataPartition) startEvict() {
	// only cache dp performs evict; other partition types return immediately.
if !proto.IsCacheDp(dp.partitionType) {
return
}
log.LogDebugf("[startEvict] start do dp(%d) evict op", dp.partitionID)
vv, err := volViews.getSimpleVolView(dp.volumeID)
if err != nil {
err := fmt.Errorf("[startEvict] get vol [%s] info error, err %s", dp.volumeID, err.Error())
log.LogError(err)
panic(err)
}
lruInterval := getWithDefault(vv.CacheLruInterval, 5)
cacheTtl := getWithDefault(vv.CacheTtl, 30)
lruTimer := time.NewTicker(time.Duration(lruInterval) * time.Minute)
ttlTimer := time.NewTicker(time.Duration(util.OneDaySec()) * time.Second)
defer func() {
lruTimer.Stop()
ttlTimer.Stop()
}()
for {
// check volume type and dp type.
if proto.IsHot(vv.VolType) || !proto.IsCacheDp(dp.partitionType) {
log.LogErrorf("action[startEvict] cannot startEvict, vol(%v), dp(%v).", vv.Name, dp.partitionID)
return
}
select {
case <-lruTimer.C:
log.LogDebugf("start [doExtentEvict] vol(%s), dp(%d).", vv.Name, dp.partitionID)
evictStart := time.Now()
dp.doExtentEvict(vv)
log.LogDebugf("action[doExtentEvict] vol(%v), dp(%v), cost (%v)ms, .", vv.Name, dp.partitionID, time.Since(evictStart))
case <-ttlTimer.C:
log.LogDebugf("start [doExtentTtl] vol(%s), dp(%d).", vv.Name, dp.partitionID)
ttlStart := time.Now()
dp.doExtentTtl(cacheTtl)
log.LogDebugf("action[doExtentTtl] vol(%v), dp(%v), cost (%v)ms.", vv.Name, dp.partitionID, time.Since(ttlStart))
case <-dp.stopC:
			log.LogWarn("task[startEvict] stopped", dp.volumeID, dp.partitionID)
return
}
// loop update vol info
newVV, err := volViews.getSimpleVolView(dp.volumeID)
if err != nil {
err := fmt.Errorf("[startEvict] get vol [%s] info error, err %s", dp.volumeID, err.Error())
log.LogError(err)
continue
}
vv = newVV
if lruInterval != vv.CacheLruInterval || cacheTtl != vv.CacheTtl {
lruInterval = getWithDefault(vv.CacheLruInterval, 5)
cacheTtl = getWithDefault(vv.CacheTtl, 30)
			// Reset the existing ticker instead of allocating a new one so the old ticker is not leaked.
			lruTimer.Reset(time.Duration(lruInterval) * time.Minute)
log.LogInfof("[startEvict] update vol config, dp(%d) %v ", dp.partitionID, *vv)
}
}
}
func getWithDefault(base, def int) int {
if base <= 0 {
return def
}
return base
}
func (dp *DataPartition) StopDecommissionRecover(stop bool) {
// only work for decommission repair
if !dp.isDecommissionRecovering() {
log.LogWarnf("[StopDecommissionRecover] dp(%d) is not in recovering status: type %d status %d",
dp.partitionID, dp.partitionType, dp.Status())
return
}
// for check timeout
dp.stopRecover = stop
dp.PersistMetadata()
}
func (dp *DataPartition) isDecommissionRecovering() bool {
// decommission recover failed or success will set to normal
return dp.DataPartitionCreateType == proto.DecommissionedCreateDataPartition
}
func (dp *DataPartition) handleDecommissionRecoverFailed() {
if !dp.isDecommissionRecovering() {
return
}
// prevent status changing from Unavailable to Recovering again in statusUpdate()
dp.partitionType = proto.NormalCreateDataPartition
dp.partitionStatus = proto.Unavailable
log.LogWarnf("[handleDecommissionRecoverFailed] dp(%d) recover failed reach max limit", dp.partitionID)
dp.PersistMetadata()
dp.StopDecommissionRecover(true)
}
func (dp *DataPartition) incDiskErrCnt() {
diskErrCnt := atomic.AddUint64(&dp.diskErrCnt, 1)
log.LogWarnf("[incDiskErrCnt]: dp(%v) disk err count:%v", dp.partitionID, diskErrCnt)
}
func (dp *DataPartition) getDiskErrCnt() uint64 {
return atomic.LoadUint64(&dp.diskErrCnt)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"net"
"strings"
"sync/atomic"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
type RaftCmdItem struct {
Op uint32 `json:"op"`
K []byte `json:"k"`
V []byte `json:"v"`
}
type rndWrtOpItem struct {
opcode uint8
extentID uint64
offset int64
size int64
data []byte
crc uint32
}
// Marshal random write value to binary data.
// Binary frame structure:
// +-------+---------------+--------+----------+--------+------+------+------+
// | Item  | magic version | opcode | extentID | offset | size | crc  | data |
// +-------+---------------+--------+----------+--------+------+------+------+
// | bytes | 4             | 1      | 8        | 8      | 8    | 4    | size |
// +-------+---------------+--------+----------+--------+------+------+------+
const (
BinaryMarshalMagicVersion = 0xFF
)
func MarshalRandWriteRaftLog(opcode uint8, extentID uint64, offset, size int64, data []byte, crc uint32) (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
buff.Grow(8 + 8*2 + 4 + int(size) + 4 + 4)
if err = binary.Write(buff, binary.BigEndian, uint32(BinaryMarshalMagicVersion)); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, opcode); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, extentID); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, offset); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, size); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, crc); err != nil {
return
}
if _, err = buff.Write(data); err != nil {
return
}
result = buff.Bytes()
return
}
// UnmarshalRandWriteRaftLog unmarshals a random write entry from its binary representation.
func UnmarshalRandWriteRaftLog(raw []byte) (opItem *rndWrtOpItem, err error) {
opItem = new(rndWrtOpItem)
buff := bytes.NewBuffer(raw)
var version uint32
if err = binary.Read(buff, binary.BigEndian, &version); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.opcode); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.extentID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.offset); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.size); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.crc); err != nil {
return
}
opItem.data = make([]byte, opItem.size)
if _, err = buff.Read(opItem.data); err != nil {
return
}
return
}
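// exampleRandWriteRaftLogRoundTrip is an illustrative sketch and not part of
// the original code path: it marshals a random-write entry and unmarshals it
// again, mirroring the frame layout documented above (magic version, opcode,
// extentID, offset, size, crc, data). The payload, extent id and crc value are
// hypothetical; the real caller computes crc with crc32.ChecksumIEEE.
func exampleRandWriteRaftLogRoundTrip() error {
	data := []byte("hello extent")
	crc := uint32(0xABCD)
	raw, err := MarshalRandWriteRaftLog(proto.OpRandomWrite, 100, 0, int64(len(data)), data, crc)
	if err != nil {
		return err
	}
	item, err := UnmarshalRandWriteRaftLog(raw)
	if err != nil {
		return err
	}
	if item.extentID != 100 || item.crc != crc || !bytes.Equal(item.data, data) {
		return fmt.Errorf("round trip mismatch: %+v", item)
	}
	return nil
}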
func MarshalRaftCmd(raftOpItem *RaftCmdItem) (raw []byte, err error) {
if raw, err = json.Marshal(raftOpItem); err != nil {
return
}
return
}
func UnmarshalRaftCmd(raw []byte) (raftOpItem *RaftCmdItem, err error) {
raftOpItem = new(RaftCmdItem)
defer func() {
		log.LogDebugf("UnmarshalRaftCmd result %v", err)
}()
if err = json.Unmarshal(raw, raftOpItem); err != nil {
return
}
return
}
func UnmarshalOldVersionRaftLog(raw []byte) (opItem *rndWrtOpItem, err error) {
raftOpItem := new(RaftCmdItem)
defer func() {
		log.LogDebugf("UnmarshalOldVersionRaftLog result %v", err)
}()
if err = json.Unmarshal(raw, raftOpItem); err != nil {
return
}
opItem, err = UnmarshalOldVersionRandWriteOpItem(raftOpItem.V)
if err != nil {
return
}
opItem.opcode = uint8(raftOpItem.Op)
return
}
func UnmarshalOldVersionRandWriteOpItem(raw []byte) (result *rndWrtOpItem, err error) {
var opItem rndWrtOpItem
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &opItem.extentID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.offset); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.size); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.crc); err != nil {
return
}
opItem.data = make([]byte, opItem.size)
if _, err = buff.Read(opItem.data); err != nil {
return
}
result = &opItem
return
}
// CheckLeader checks whether this replica is the raft leader before serving a read.
func (dp *DataPartition) CheckLeader(request *repl.Packet, connect net.Conn) (err error) {
// and use another getRaftLeaderAddr() to return the actual address
_, ok := dp.IsRaftLeader()
if !ok {
err = raft.ErrNotLeader
logContent := fmt.Sprintf("action[ReadCheck] %v.", request.LogMessage(request.GetOpMsg(), connect.RemoteAddr().String(), request.StartT, err))
log.LogWarnf(logContent)
return
}
return
}
type ItemIterator struct {
applyID uint64
}
// NewItemIterator creates a new item iterator.
func NewItemIterator(applyID uint64) *ItemIterator {
si := new(ItemIterator)
si.applyID = applyID
return si
}
// ApplyIndex returns the appliedID
func (si *ItemIterator) ApplyIndex() uint64 {
return si.applyID
}
// Close Closes the iterator.
func (si *ItemIterator) Close() {
// do nothing
}
// Next returns the next item in the iterator.
func (si *ItemIterator) Next() (data []byte, err error) {
// appIDBuf := make([]byte, 8)
// binary.BigEndian.PutUint64(appIDBuf, si.applyID)
// data = appIDBuf[:]
err = io.EOF
return
}
// ApplyRandomWrite random write apply
func (dp *DataPartition) ApplyRandomWrite(command []byte, raftApplyID uint64) (respStatus interface{}, err error) {
opItem := &rndWrtOpItem{}
respStatus = proto.OpOk
defer func() {
if err == nil {
dp.uploadApplyID(raftApplyID)
log.LogDebugf("action[ApplyRandomWrite] dp(%v) raftApplyID(%v) success!", dp.partitionID, raftApplyID)
} else {
if respStatus == proto.OpExistErr { // for tryAppendWrite
err = nil
log.LogDebugf("[ApplyRandomWrite] ApplyID(%v) Partition(%v)_Extent(%v)_ExtentOffset(%v)_Size(%v) apply err(%v) retry[20]",
raftApplyID, dp.partitionID, opItem.extentID, opItem.offset, opItem.size, err)
return
}
err = fmt.Errorf("[ApplyRandomWrite] ApplyID(%v) Partition(%v)_Extent(%v)_ExtentOffset(%v)_Size(%v) apply err(%v) retry[20]",
raftApplyID, dp.partitionID, opItem.extentID, opItem.offset, opItem.size, err)
log.LogErrorf("action[ApplyRandomWrite] Partition(%v) failed err %v", dp.partitionID, err)
exporter.Warning(err.Error())
if respStatus == proto.OpOk {
respStatus = proto.OpDiskErr
}
panic(newRaftApplyError(err))
}
}()
if opItem, err = UnmarshalRandWriteRaftLog(command); err != nil {
log.LogErrorf("[ApplyRandomWrite] ApplyID(%v) Partition(%v) unmarshal failed(%v)", raftApplyID, dp.partitionID, err)
return
}
log.LogDebugf("[ApplyRandomWrite] ApplyID(%v) Partition(%v)_Extent(%v)_ExtentOffset(%v)_Size(%v)",
raftApplyID, dp.partitionID, opItem.extentID, opItem.offset, opItem.size)
for i := 0; i < 20; i++ {
dp.disk.allocCheckLimit(proto.FlowWriteType, uint32(opItem.size))
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
var syncWrite bool
writeType := storage.RandomWriteType
if opItem.opcode == proto.OpRandomWrite || opItem.opcode == proto.OpSyncRandomWrite {
if dp.verSeq > 0 {
err = storage.VerNotConsistentError
log.LogErrorf("action[ApplyRandomWrite] volume [%v] dp [%v] %v,client need update to newest version!", dp.volumeID, dp.partitionID, err)
return
}
} else if opItem.opcode == proto.OpRandomWriteAppend || opItem.opcode == proto.OpSyncRandomWriteAppend {
writeType = storage.AppendRandomWriteType
} else if opItem.opcode == proto.OpTryWriteAppend || opItem.opcode == proto.OpSyncTryWriteAppend {
writeType = storage.AppendWriteType
}
if opItem.opcode == proto.OpSyncRandomWriteAppend || opItem.opcode == proto.OpSyncRandomWrite || opItem.opcode == proto.OpSyncRandomWriteVer {
syncWrite = true
}
dp.disk.limitWrite.Run(int(opItem.size), func() {
respStatus, err = dp.ExtentStore().Write(opItem.extentID, opItem.offset, opItem.size, opItem.data, opItem.crc, writeType, syncWrite)
})
if err == nil {
break
}
if IsDiskErr(err.Error()) {
panic(newRaftApplyError(err))
}
if strings.Contains(err.Error(), storage.ExtentNotFoundError.Error()) {
err = nil
return
}
if (opItem.opcode == proto.OpTryWriteAppend || opItem.opcode == proto.OpSyncTryWriteAppend) && respStatus == proto.OpTryOtherExtent {
err = nil
return
}
log.LogErrorf("[ApplyRandomWrite] ApplyID(%v) Partition(%v)_Extent(%v)_ExtentOffset(%v)_Size(%v) apply err(%v) retry(%v)",
raftApplyID, dp.partitionID, opItem.extentID, opItem.offset, opItem.size, err, i)
}
return
}
// RandomWriteSubmit submits the proposal to raft.
func (dp *DataPartition) RandomWriteSubmit(pkg *repl.Packet) (err error) {
val, err := MarshalRandWriteRaftLog(pkg.Opcode, pkg.ExtentID, pkg.ExtentOffset, int64(pkg.Size), pkg.Data, pkg.CRC)
if err != nil {
log.LogErrorf("action[RandomWriteSubmit] [%v] marshal error %v", dp.partitionID, err)
return
}
pkg.ResultCode, err = dp.Submit(val)
return
}
func (dp *DataPartition) Submit(val []byte) (retCode uint8, err error) {
var resp interface{}
resp, err = dp.Put(nil, val)
retCode, _ = resp.(uint8)
if err != nil {
log.LogErrorf("action[RandomWriteSubmit] submit err %v", err)
return
}
return
}
func (dp *DataPartition) CheckWriteVer(p *repl.Packet) (err error) {
log.LogDebugf("action[CheckWriteVer] packet %v dpseq %v ", p, dp.verSeq)
if atomic.LoadUint64(&dp.verSeq) == p.VerSeq {
return
}
if p.Opcode == proto.OpSyncRandomWrite || p.Opcode == proto.OpRandomWrite {
		err = fmt.Errorf("volume enables multi version")
log.LogErrorf("action[CheckWriteVer] error %v", err)
return
}
if p.VerSeq < dp.verSeq {
p.ExtentType |= proto.MultiVersionFlag
p.ExtentType |= proto.VersionListFlag
if p.Opcode == proto.OpRandomWriteVer || p.Opcode == proto.OpSyncRandomWriteVer {
err = storage.VerNotConsistentError
			log.LogDebugf("action[CheckWriteVer] dp %v client verSeq[%v] smaller than dataPartition ver[%v]",
dp.config.PartitionID, p.VerSeq, dp.verSeq)
}
p.VerSeq = dp.verSeq
dp.volVersionInfoList.RWLock.RLock()
p.VerList = make([]*proto.VolVersionInfo, len(dp.volVersionInfoList.VerList))
copy(p.VerList, dp.volVersionInfoList.VerList)
dp.volVersionInfoList.RWLock.RUnlock()
log.LogInfof("action[CheckWriteVer] partitionId %v reqId %v verList %v seq %v dpVerList %v",
p.PartitionID, p.ReqID, p.VerList, p.VerSeq, dp.volVersionInfoList.VerList)
return
} else if p.VerSeq > dp.verSeq {
log.LogWarnf("action[CheckWriteVer] partitionId %v reqId %v verList (%v) seq %v old one(%v)",
p.PartitionID, p.ReqID, p.VerList, p.VerSeq, dp.volVersionInfoList.VerList)
dp.verSeq = p.VerSeq
dp.volVersionInfoList.RWLock.Lock()
dp.volVersionInfoList.VerList = make([]*proto.VolVersionInfo, len(p.VerList))
copy(dp.volVersionInfoList.VerList, p.VerList)
dp.volVersionInfoList.RWLock.Unlock()
}
return
}
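// CheckWriteVer above folds the multi-version handling into one function. The
// hypothetical helper below (exampleVerCheckAction, illustration only, not
// called anywhere) restates the three outcomes as a sketch: equal sequences
// pass straight through, an older client is told to refresh and gets the
// partition's version list back, and a newer client advances the partition's
// own sequence and list.
func exampleVerCheckAction(clientSeq, dpSeq uint64) string {
	switch {
	case clientSeq == dpSeq:
		return "accept the write as-is"
	case clientSeq < dpSeq:
		return "reject versioned writes with VerNotConsistentError and reply with the dp version list"
	default: // clientSeq > dpSeq
		return "adopt the client's newer sequence and version list"
	}
}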
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"net"
"os"
"path"
"strconv"
"strings"
"sync/atomic"
"time"
raftproto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
type dataPartitionCfg struct {
VolName string `json:"vol_name"`
ClusterID string `json:"cluster_id"`
PartitionID uint64 `json:"partition_id"`
PartitionSize int `json:"partition_size"`
PartitionType int `json:"partition_type"`
Peers []proto.Peer `json:"peers"`
Hosts []string `json:"hosts"`
NodeID uint64 `json:"-"`
RaftStore raftstore.RaftStore `json:"-"`
ReplicaNum int
VerSeq uint64 `json:"ver_seq"`
CreateType int
Forbidden bool
}
func (dp *DataPartition) raftPort() (heartbeat, replica int, err error) {
raftConfig := dp.config.RaftStore.RaftConfig()
heartbeatAddrSplits := strings.Split(raftConfig.HeartbeatAddr, ":")
replicaAddrSplits := strings.Split(raftConfig.ReplicateAddr, ":")
if len(heartbeatAddrSplits) != 2 {
err = errors.New("illegal heartbeat address")
return
}
if len(replicaAddrSplits) != 2 {
err = errors.New("illegal replica address")
return
}
heartbeat, err = strconv.Atoi(heartbeatAddrSplits[1])
if err != nil {
return
}
replica, err = strconv.Atoi(replicaAddrSplits[1])
if err != nil {
return
}
return
}
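// raftPort above derives the heartbeat and replica ports by splitting the
// "host:port" strings from the raft config. A minimal sketch of that single
// step, assuming a well-formed address (examplePortOf is a hypothetical
// helper for illustration only and is not called anywhere):
func examplePortOf(addr string) (int, error) {
	parts := strings.Split(addr, ":")
	if len(parts) != 2 {
		return 0, errors.New("illegal address")
	}
	return strconv.Atoi(parts[1])
}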
// StartRaft starts the raft instance when the data partition starts or is restored.
func (dp *DataPartition) StartRaft(isLoad bool) (err error) {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return nil
}
var (
heartbeatPort int
replicaPort int
peers []raftstore.PeerAddress
)
defer func() {
if r := recover(); r != nil {
mesg := fmt.Sprintf("StartRaft(%v) Raft Panic (%v)", dp.partitionID, r)
log.LogError(mesg)
if isLoad {
err = errors.New(mesg)
} else {
log.LogFlush()
panic(mesg)
}
}
}()
if heartbeatPort, replicaPort, err = dp.raftPort(); err != nil {
return
}
for _, peer := range dp.config.Peers {
addr := strings.Split(peer.Addr, ":")[0]
rp := raftstore.PeerAddress{
Peer: raftproto.Peer{
ID: peer.ID,
},
Address: addr,
HeartbeatPort: heartbeatPort,
ReplicaPort: replicaPort,
}
peers = append(peers, rp)
}
log.LogDebugf("start partition(%v) raft peers: %s path: %s",
dp.partitionID, peers, dp.path)
pc := &raftstore.PartitionConfig{
ID: uint64(dp.partitionID),
Applied: dp.appliedID,
Peers: peers,
SM: dp,
WalPath: dp.path,
}
dp.raftPartition, err = dp.config.RaftStore.CreatePartition(pc)
if err == nil {
dp.ForceSetRaftRunning()
dp.ForceSetDataPartitionToFininshLoad()
}
return
}
func (dp *DataPartition) raftStopped() bool {
return atomic.LoadInt32(&dp.raftStatus) == RaftStatusStopped
}
func (dp *DataPartition) stopRaft() {
if atomic.CompareAndSwapInt32(&dp.raftStatus, RaftStatusRunning, RaftStatusStopped) {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return
}
log.LogErrorf("[FATAL] stop raft partition(%v)", dp.partitionID)
dp.raftPartition.Stop()
}
}
func (dp *DataPartition) CanRemoveRaftMember(peer proto.Peer, force bool) error {
if !dp.isNormalType() {
return fmt.Errorf("CanRemoveRaftMember (%v) not support", dp)
}
downReplicas := dp.config.RaftStore.RaftServer().GetDownReplicas(dp.partitionID)
hasExist := false
for _, p := range dp.config.Peers {
if p.ID == peer.ID {
hasExist = true
break
}
}
if !hasExist {
log.LogInfof("action[CanRemoveRaftMember] replicaNum %v peers %v, peer %v not found", dp.replicaNum, len(dp.config.Peers), peer)
return nil
}
hasDownReplicasExcludePeer := make([]uint64, 0)
for _, nodeID := range downReplicas {
if nodeID.NodeID == peer.ID {
continue
}
// check nodeID is valid
hasDownReplicasExcludePeer = append(hasDownReplicasExcludePeer, nodeID.NodeID)
}
log.LogInfof("action[CanRemoveRaftMember] dp %v replicaNum %v peers %v", dp.partitionID, dp.replicaNum, len(dp.config.Peers))
if dp.replicaNum == 2 && len(dp.config.Peers) == 2 && force {
return nil
}
sumReplicas := len(dp.config.Peers)
if sumReplicas%2 == 1 {
if sumReplicas-len(hasDownReplicasExcludePeer) > (sumReplicas/2 + 1) {
return nil
}
} else {
if sumReplicas-len(hasDownReplicasExcludePeer) >= (sumReplicas/2 + 1) {
return nil
}
}
return fmt.Errorf("hasDownReplicasExcludePeer(%v) too much,so donnot offline (%v)", downReplicas, peer)
}
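// CanRemoveRaftMember above allows the removal only when the replicas that
// would remain reachable still form a majority of the current peer set. A
// hypothetical restatement of that arithmetic (exampleRemovalKeepsQuorum,
// illustration only, not called anywhere):
func exampleRemovalKeepsQuorum(totalPeers, downExcludingPeer int) bool {
	alive := totalPeers - downExcludingPeer
	majority := totalPeers/2 + 1
	if totalPeers%2 == 1 {
		// odd-sized groups require strictly more than a bare majority here
		return alive > majority
	}
	return alive >= majority
}

// For example, with 3 peers and no other replica down, alive=3 > majority=2,
// so the member may be removed; with 3 peers and one other replica already
// down, alive=2 fails the check and the request is refused.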
// StartRaftLoggingSchedule starts the task schedule as follows:
// 1. write the raft applied id into disk.
// 2. collect the applied ids from raft members.
// 3. truncate and delete the saved raft logs below the minimum applied id in order to free disk space.
func (dp *DataPartition) StartRaftLoggingSchedule() {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return
}
getAppliedIDTimer := time.NewTimer(time.Second * 1)
truncateRaftLogTimer := time.NewTimer(time.Minute * 10)
storeAppliedIDTimer := time.NewTimer(time.Second * 10)
log.LogDebugf("[startSchedule] hello DataPartition schedule")
for {
select {
case <-dp.stopC:
log.LogDebugf("[startSchedule] stop partition(%v)", dp.partitionID)
getAppliedIDTimer.Stop()
truncateRaftLogTimer.Stop()
storeAppliedIDTimer.Stop()
return
case extentID := <-dp.stopRaftC:
dp.stopRaft()
log.LogErrorf("action[ExtentRepair] stop raft partition(%v)_%v", dp.partitionID, extentID)
case <-getAppliedIDTimer.C:
if !dp.raftStopped() {
dp.updateMaxMinAppliedID()
}
getAppliedIDTimer.Reset(time.Minute * 1)
case <-truncateRaftLogTimer.C:
if dp.raftStopped() {
break
}
if dp.minAppliedID > dp.lastTruncateID { // Has changed
appliedID := atomic.LoadUint64(&dp.appliedID)
if err := dp.storeAppliedID(appliedID); err != nil {
log.LogErrorf("partition [%v] persist applied ID [%v] during scheduled truncate raft log failed: %v", dp.partitionID, appliedID, err)
truncateRaftLogTimer.Reset(time.Minute)
continue
}
dp.raftPartition.Truncate(dp.minAppliedID)
dp.lastTruncateID = dp.minAppliedID
if err := dp.PersistMetadata(); err != nil {
log.LogErrorf("partition [%v] persist metadata during scheduled truncate raft log failed: %v", dp.partitionID, err)
truncateRaftLogTimer.Reset(time.Minute)
continue
}
log.LogInfof("partition [%v] scheduled truncate raft log [applied: %v, truncated: %v]", dp.partitionID, appliedID, dp.minAppliedID)
}
truncateRaftLogTimer.Reset(time.Minute)
case <-storeAppliedIDTimer.C:
appliedID := atomic.LoadUint64(&dp.appliedID)
if err := dp.storeAppliedID(appliedID); err != nil {
log.LogErrorf("partition [%v] scheduled persist applied ID [%v] failed: %v", dp.partitionID, appliedID, err)
}
storeAppliedIDTimer.Reset(time.Second * 10)
}
}
}
// StartRaftAfterRepair starts the raft after repairing a partition.
// It can only happen after all the extent files are repaired by the leader.
// When the repair is finished, the local dp.partitionSize is the same as the leader's dp.partitionSize.
// The repair task can be done in statusUpdateScheduler->LaunchRepair.
func (dp *DataPartition) StartRaftAfterRepair(isLoad bool) {
log.LogDebugf("StartRaftAfterRepair enter")
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return
}
var (
initPartitionSize, initMaxExtentID uint64
currLeaderPartitionSize uint64
err error
)
timer := time.NewTicker(5 * time.Second)
for {
select {
case <-timer.C:
err = nil
if dp.isLeader { // the primary does not need to wait for the repair
if err := dp.StartRaft(isLoad); err != nil {
log.LogErrorf("PartitionID(%v) leader start raft err(%v).", dp.partitionID, err)
continue
}
log.LogDebugf("PartitionID(%v) leader started.", dp.partitionID)
return
}
if dp.stopRecover && dp.isDecommissionRecovering() {
log.LogDebugf("action[StartRaftAfterRepair] PartitionID(%v) receive stop signal.", dp.partitionID)
continue
}
// wait for dp.replicas to be updated
if dp.getReplicaLen() == 0 {
continue
}
if initMaxExtentID == 0 || initPartitionSize == 0 {
initMaxExtentID, initPartitionSize, err = dp.getLeaderMaxExtentIDAndPartitionSize()
}
if err != nil {
log.LogErrorf("action[StartRaftAfterRepair] PartitionID(%v) get MaxExtentID err(%v)", dp.partitionID, err)
continue
}
// get the partition size from the primary and compare it with the local one
currLeaderPartitionSize, err = dp.getLeaderPartitionSize(initMaxExtentID)
if err != nil {
log.LogErrorf("action[StartRaftAfterRepair] PartitionID(%v) get leader size err(%v)", dp.partitionID, err)
continue
}
dp.leaderSize = int(currLeaderPartitionSize)
if currLeaderPartitionSize < initPartitionSize {
initPartitionSize = currLeaderPartitionSize
}
localSize := dp.extentStore.StoreSizeExtentID(initMaxExtentID)
dp.decommissionRepairProgress = float64(localSize) / float64(initPartitionSize)
log.LogInfof("action[StartRaftAfterRepair] PartitionID(%v) initMaxExtentID(%v) initPartitionSize(%v) currLeaderPartitionSize(%v)"+
"localSize(%v)", dp.partitionID, initMaxExtentID, initPartitionSize, currLeaderPartitionSize, localSize)
if initPartitionSize > localSize {
log.LogErrorf("action[StartRaftAfterRepair] PartitionID(%v) leader size(%v) local size(%v) wait snapshot recover", dp.partitionID, initPartitionSize, localSize)
continue
}
if err := dp.StartRaft(isLoad); err != nil {
log.LogErrorf("action[StartRaftAfterRepair] PartitionID(%v) start raft err(%v). Retry after 20s.", dp.partitionID, err)
timer.Reset(5 * time.Second)
continue
}
// start raft
dp.DataPartitionCreateType = proto.NormalCreateDataPartition
log.LogInfof("action[StartRaftAfterRepair] PartitionID(%v) change to NormalCreateDataPartition",
dp.partitionID)
dp.decommissionRepairProgress = float64(1)
dp.PersistMetadata()
log.LogInfof("action[StartRaftAfterRepair] PartitionID(%v) raft started!", dp.partitionID)
return
case <-dp.stopC:
log.LogDebugf("action[StartRaftAfterRepair] PartitionID(%v) receive dp stop signal!!.", dp.partitionID)
timer.Stop()
return
}
}
}
// Add a raft node.
func (dp *DataPartition) addRaftNode(req *proto.AddDataPartitionRaftMemberRequest, index uint64) (isUpdated bool, err error) {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return false, fmt.Errorf("addRaftNode (%v) not support", dp)
}
var (
heartbeatPort int
replicaPort int
)
if heartbeatPort, replicaPort, err = dp.raftPort(); err != nil {
return
}
log.LogInfof("action[addRaftNode] add raft node peer [%v]", req.AddPeer)
found := false
for _, peer := range dp.config.Peers {
if peer.ID == req.AddPeer.ID {
found = true
break
}
}
isUpdated = !found
if !isUpdated {
return
}
data, _ := json.Marshal(req)
log.LogInfof("addRaftNode: partitionID(%v) nodeID(%v) index(%v) data(%v) ",
req.PartitionId, dp.config.NodeID, index, string(data))
dp.config.Peers = append(dp.config.Peers, req.AddPeer)
dp.config.Hosts = append(dp.config.Hosts, req.AddPeer.Addr)
dp.replicasLock.Lock()
dp.replicas = make([]string, len(dp.config.Hosts))
copy(dp.replicas, dp.config.Hosts)
dp.replicasLock.Unlock()
addr := strings.Split(req.AddPeer.Addr, ":")[0]
dp.config.RaftStore.AddNodeWithPort(req.AddPeer.ID, addr, heartbeatPort, replicaPort)
return
}
// Delete a raft node.
func (dp *DataPartition) removeRaftNode(req *proto.RemoveDataPartitionRaftMemberRequest, index uint64) (isUpdated bool, err error) {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return false, fmt.Errorf("removeRaftNode (%v) not support", dp)
}
var canRemoveSelf bool
if canRemoveSelf, err = dp.canRemoveSelf(); err != nil {
return
}
peerIndex := -1
data, _ := json.Marshal(req)
isUpdated = false
log.LogInfof("Start RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, dp.config.NodeID, string(data))
for i, peer := range dp.config.Peers {
if peer.ID == req.RemovePeer.ID {
peerIndex = i
isUpdated = true
break
}
}
if !isUpdated {
log.LogInfof("NoUpdate RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, dp.config.NodeID, string(data))
return
}
hostIndex := -1
for index, host := range dp.config.Hosts {
if host == req.RemovePeer.Addr {
hostIndex = index
break
}
}
if hostIndex != -1 {
dp.config.Hosts = append(dp.config.Hosts[:hostIndex], dp.config.Hosts[hostIndex+1:]...)
}
dp.config.Peers = append(dp.config.Peers[:peerIndex], dp.config.Peers[peerIndex+1:]...)
if dp.config.NodeID == req.RemovePeer.ID && !dp.IsDataPartitionLoading() && canRemoveSelf {
dp.raftPartition.Delete()
dp.Disk().space.DeletePartition(dp.partitionID)
isUpdated = false
}
// update dp replicas after removing a raft node
if isUpdated {
dp.replicasLock.Lock()
dp.replicas = make([]string, len(dp.config.Hosts))
copy(dp.replicas, dp.config.Hosts)
dp.replicasLock.Unlock()
}
log.LogInfof("Finish RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, dp.config.NodeID, string(data))
return
}
func (dp *DataPartition) storeAppliedID(applyIndex uint64) (err error) {
filename := path.Join(dp.Path(), TempApplyIndexFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
fp.Close()
os.Remove(filename)
}()
if _, err = fp.WriteString(fmt.Sprintf("%d", applyIndex)); err != nil {
return
}
fp.Sync()
err = os.Rename(filename, path.Join(dp.Path(), ApplyIndexFile))
return
}
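// storeAppliedID above uses the usual write-temp-then-rename pattern so a
// crash can never leave a half-written ApplyIndexFile behind. A minimal
// generic sketch of the same idea (exampleAtomicWriteUint64 is a hypothetical
// helper, illustration only, not called anywhere):
func exampleAtomicWriteUint64(dir, name string, v uint64) (err error) {
	tmp := path.Join(dir, "."+name+".tmp")
	fp, err := os.OpenFile(tmp, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0o755)
	if err != nil {
		return
	}
	defer func() {
		fp.Close()
		os.Remove(tmp) // no-op after a successful rename
	}()
	if _, err = fp.WriteString(fmt.Sprintf("%d", v)); err != nil {
		return
	}
	fp.Sync()
	return os.Rename(tmp, path.Join(dir, name))
}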
// LoadAppliedID loads the applied IDs to the memory.
func (dp *DataPartition) LoadAppliedID() (err error) {
filename := path.Join(dp.Path(), ApplyIndexFile)
if _, err = os.Stat(filename); err != nil {
return
}
data, err := os.ReadFile(filename)
if err != nil {
err = errors.NewErrorf("[loadApplyIndex] OpenFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadApplyIndex]: ApplyIndex is empty")
return
}
if _, err = fmt.Sscanf(string(data), "%d", &dp.appliedID); err != nil {
err = errors.NewErrorf("[loadApplyID] ReadApplyID: %s", err.Error())
return
}
dp.extentStore.ApplyId = dp.appliedID
return
}
func (dp *DataPartition) SetMinAppliedID(id uint64) {
dp.minAppliedID = id
}
func (dp *DataPartition) GetAppliedID() (id uint64) {
return dp.appliedID
}
func (s *DataNode) parseRaftConfig(cfg *config.Config) (err error) {
s.raftDir = cfg.GetString(ConfigKeyRaftDir)
if s.raftDir == "" {
return fmt.Errorf("bad raftDir config")
}
s.tickInterval = int(cfg.GetFloat(CfgTickInterval))
s.raftHeartbeat = cfg.GetString(ConfigKeyRaftHeartbeat)
s.raftReplica = cfg.GetString(ConfigKeyRaftReplica)
s.raftRecvBufSize = int(cfg.GetInt(CfgRaftRecvBufSize))
log.LogDebugf("[parseRaftConfig] load raftDir(%v).", s.raftDir)
log.LogDebugf("[parseRaftConfig] load raftHearbeat(%v).", s.raftHeartbeat)
log.LogDebugf("[parseRaftConfig] load raftReplica(%v).", s.raftReplica)
return
}
func (s *DataNode) startRaftServer(cfg *config.Config) (err error) {
log.LogInfo("Start: startRaftServer")
s.parseRaftConfig(cfg)
if s.clusterUuidEnable {
if err = config.CheckOrStoreClusterUuid(s.raftDir, s.clusterUuid, false); err != nil {
log.LogErrorf("CheckOrStoreClusterUuid failed: %v", err)
return fmt.Errorf("CheckOrStoreClusterUuid failed: %v", err)
}
}
constCfg := config.ConstConfig{
Listen: s.port,
RaftHeartbetPort: s.raftHeartbeat,
RaftReplicaPort: s.raftReplica,
}
ok := false
if ok, err = config.CheckOrStoreConstCfg(s.raftDir, config.DefaultConstConfigFile, &constCfg); !ok {
log.LogErrorf("constCfg check failed %v %v %v %v", s.raftDir, config.DefaultConstConfigFile, constCfg, err)
return fmt.Errorf("constCfg check failed %v %v %v %v", s.raftDir, config.DefaultConstConfigFile, constCfg, err)
}
if _, err = os.Stat(s.raftDir); err != nil {
if err = os.MkdirAll(s.raftDir, 0o755); err != nil {
err = errors.NewErrorf("create raft server dir: %s", err.Error())
log.LogErrorf("action[startRaftServer] cannot start raft server err(%v)", err)
return
}
}
heartbeatPort, err := strconv.Atoi(s.raftHeartbeat)
if err != nil {
err = errors.NewErrorf("Raft heartbeat port configuration error: %s", err.Error())
return
}
replicatePort, err := strconv.Atoi(s.raftReplica)
if err != nil {
err = errors.NewErrorf("Raft replica port configuration error: %s", err.Error())
return
}
raftConf := &raftstore.Config{
NodeID: s.nodeID,
RaftPath: s.raftDir,
IPAddr: LocalIP,
HeartbeatPort: heartbeatPort,
ReplicaPort: replicatePort,
NumOfLogsToRetain: DefaultRaftLogsToRetain,
TickInterval: s.tickInterval,
RecvBufSize: s.raftRecvBufSize,
}
s.raftStore, err = raftstore.NewRaftStore(raftConf, cfg)
if err != nil {
err = errors.NewErrorf("new raftStore: %s", err.Error())
log.LogErrorf("action[startRaftServer] cannot start raft server err(%v)", err)
}
return
}
func (s *DataNode) stopRaftServer() {
if s.raftStore != nil {
s.raftStore.Stop()
}
}
// NewPacketToBroadcastMinAppliedID returns a new packet to broadcast the min applied ID.
func NewPacketToBroadcastMinAppliedID(partitionID uint64, minAppliedID uint64) (p *repl.Packet) {
p = new(repl.Packet)
p.Opcode = proto.OpBroadcastMinAppliedID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
p.Data = make([]byte, 8)
binary.BigEndian.PutUint64(p.Data, minAppliedID)
p.Size = uint32(len(p.Data))
return
}
// NewPacketToGetAppliedID returns a new packet to get the applied ID.
func NewPacketToGetAppliedID(partitionID uint64) (p *repl.Packet) {
p = new(repl.Packet)
p.Opcode = proto.OpGetAppliedId
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
return
}
// NewPacketToGetPartitionSize returns a new packet to get the partition size.
func NewPacketToGetPartitionSize(partitionID uint64) (p *repl.Packet) {
p = new(repl.Packet)
p.Opcode = proto.OpGetPartitionSize
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
return
}
// NewPacketToGetMaxExtentIDAndPartitionSIze returns a new packet to get the max extent ID and the partition size.
func NewPacketToGetMaxExtentIDAndPartitionSIze(partitionID uint64) (p *repl.Packet) {
p = new(repl.Packet)
p.Opcode = proto.OpGetMaxExtentIDAndPartitionSize
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
return
}
func (dp *DataPartition) findMinAppliedID(allAppliedIDs []uint64) (minAppliedID uint64, index int) {
index = 0
minAppliedID = allAppliedIDs[0]
for i := 1; i < len(allAppliedIDs); i++ {
if allAppliedIDs[i] < minAppliedID {
minAppliedID = allAppliedIDs[i]
index = i
}
}
return minAppliedID, index
}
func (dp *DataPartition) findMaxAppliedID(allAppliedIDs []uint64) (maxAppliedID uint64, index int) {
for i := 0; i < len(allAppliedIDs); i++ {
if allAppliedIDs[i] > maxAppliedID {
maxAppliedID = allAppliedIDs[i]
index = i
}
}
return maxAppliedID, index
}
// Get the partition size from the leader.
func (dp *DataPartition) getLeaderPartitionSize(maxExtentID uint64) (size uint64, err error) {
var conn *net.TCPConn
p := NewPacketToGetPartitionSize(dp.partitionID)
p.ExtentID = maxExtentID
target := dp.getReplicaAddr(0)
conn, err = gConnPool.GetConnect(target) // get remote connect
if err != nil {
err = errors.Trace(err, " partition(%v) get host(%v) connect", dp.partitionID, target)
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn) // write command to the remote host
if err != nil {
err = errors.Trace(err, "partition(%v) write to host(%v)", dp.partitionID, target)
return
}
err = p.ReadFromConnWithVer(conn, 60)
if err != nil {
err = errors.Trace(err, "partition(%v) read from host(%v)", dp.partitionID, target)
return
}
if p.ResultCode != proto.OpOk {
err = errors.Trace(err, "partition(%v) result code not ok (%v) from host(%v)", dp.partitionID, p.ResultCode, target)
return
}
size = binary.BigEndian.Uint64(p.Data)
log.LogInfof("partition(%v) MaxExtentID(%v) size(%v)", dp.partitionID, maxExtentID, size)
return
}
func (dp *DataPartition) getMaxExtentIDAndPartitionSize(target string) (maxExtentID, PartitionSize uint64, err error) {
var conn *net.TCPConn
p := NewPacketToGetMaxExtentIDAndPartitionSIze(dp.partitionID)
conn, err = gConnPool.GetConnect(target) // get remote connect
if err != nil {
err = errors.Trace(err, " partition(%v) get host(%v) connect", dp.partitionID, target)
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn) // write command to the remote host
if err != nil {
err = errors.Trace(err, "partition(%v) write to host(%v)", dp.partitionID, target)
return
}
err = p.ReadFromConnWithVer(conn, 60)
if err != nil {
err = errors.Trace(err, "partition(%v) read from host(%v)", dp.partitionID, target)
return
}
if p.ResultCode != proto.OpOk {
err = errors.Trace(err, "partition(%v) result code not ok (%v) from host(%v)", dp.partitionID, p.ResultCode, target)
return
}
maxExtentID = binary.BigEndian.Uint64(p.Data[0:8])
PartitionSize = binary.BigEndian.Uint64(p.Data[8:16])
log.LogInfof("partition(%v) maxExtentID(%v) PartitionSize(%v) on leader", dp.partitionID, maxExtentID, PartitionSize)
return
}
// Get the max extent ID and the partition size from the leader.
func (dp *DataPartition) getLeaderMaxExtentIDAndPartitionSize() (maxExtentID, PartitionSize uint64, err error) {
target := dp.getReplicaAddr(0)
return dp.getMaxExtentIDAndPartitionSize(target)
}
// Get the max extent ID and the partition size from the second replica (a follower member).
func (dp *DataPartition) getMemberExtentIDAndPartitionSize() (maxExtentID, PartitionSize uint64, err error) {
target := dp.getReplicaAddr(1)
return dp.getMaxExtentIDAndPartitionSize(target)
}
func (dp *DataPartition) broadcastMinAppliedID(minAppliedID uint64) (err error) {
for i := 0; i < dp.getReplicaLen(); i++ {
p := NewPacketToBroadcastMinAppliedID(dp.partitionID, minAppliedID)
replicaHostParts := strings.Split(dp.getReplicaAddr(i), ":")
replicaHost := strings.TrimSpace(replicaHostParts[0])
if LocalIP == replicaHost {
log.LogDebugf("partition(%v) local no send msg. localIP(%v) replicaHost(%v) appliedId(%v)",
dp.partitionID, LocalIP, replicaHost, dp.appliedID)
dp.minAppliedID = minAppliedID
continue
}
target := dp.getReplicaAddr(i)
var conn *net.TCPConn
conn, err = gConnPool.GetConnect(target)
if err != nil {
return
}
err = p.WriteToConn(conn)
if err != nil {
gConnPool.PutConnect(conn, true)
return
}
err = p.ReadFromConnWithVer(conn, 60)
if err != nil {
gConnPool.PutConnect(conn, true)
return
}
gConnPool.PutConnect(conn, false)
log.LogDebugf("partition(%v) minAppliedID(%v)", dp.partitionID, minAppliedID)
}
return
}
// Get all replica applied ids
func (dp *DataPartition) getAllReplicaAppliedID() (allAppliedID []uint64, replyNum uint8) {
allAppliedID = make([]uint64, dp.getReplicaLen())
for i := 0; i < dp.getReplicaLen(); i++ {
p := NewPacketToGetAppliedID(dp.partitionID)
replicaHostParts := strings.Split(dp.getReplicaAddr(i), ":")
replicaHost := strings.TrimSpace(replicaHostParts[0])
if LocalIP == replicaHost {
log.LogDebugf("partition(%v) local no send msg. localIP(%v) replicaHost(%v) appliedId(%v)",
dp.partitionID, LocalIP, replicaHost, dp.appliedID)
allAppliedID[i] = dp.appliedID
replyNum++
continue
}
target := dp.getReplicaAddr(i)
appliedID, err := dp.getRemoteAppliedID(target, p)
if err != nil {
log.LogErrorf("partition(%v) getRemoteAppliedID Failed(%v).", dp.partitionID, err)
continue
}
if appliedID == 0 {
log.LogDebugf("[getAllReplicaAppliedID] partition(%v) local appliedID(%v) replicaHost(%v) appliedID=0",
dp.partitionID, dp.appliedID, replicaHost)
}
allAppliedID[i] = appliedID
replyNum++
}
return
}
// Get target members' applied id
func (dp *DataPartition) getRemoteAppliedID(target string, p *repl.Packet) (appliedID uint64, err error) {
var conn *net.TCPConn
start := time.Now().UnixNano()
defer func() {
if err != nil {
err = fmt.Errorf(p.LogMessage(p.GetOpMsg(), target, start, err))
log.LogErrorf(err.Error())
}
}()
conn, err = gConnPool.GetConnect(target)
if err != nil {
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn) // write command to the remote host
if err != nil {
return
}
err = p.ReadFromConnWithVer(conn, 60)
if err != nil {
return
}
if p.ResultCode != proto.OpOk {
err = errors.NewErrorf("partition(%v) result code not ok (%v) from host(%v)", dp.partitionID, p.ResultCode, target)
return
}
appliedID = binary.BigEndian.Uint64(p.Data)
log.LogDebugf("[getRemoteAppliedID] partition(%v) remoteAppliedID(%v)", dp.partitionID, appliedID)
return
}
// Get all members' applied ids and find the minimum one
func (dp *DataPartition) updateMaxMinAppliedID() {
var (
minAppliedID uint64
maxAppliedID uint64
)
// Get the applied id by the leader
_, isLeader := dp.IsRaftLeader()
if !isLeader {
return
}
// if leader has not applied the raft, no need to get others
if dp.appliedID == 0 {
return
}
allAppliedID, replyNum := dp.getAllReplicaAppliedID()
if replyNum == 0 {
log.LogDebugf("[updateMaxMinAppliedID] PartitionID(%v) Get appliedId failed!", dp.partitionID)
return
}
if replyNum == uint8(len(allAppliedID)) { // update dp.minAppliedID when every member had replied
minAppliedID, _ = dp.findMinAppliedID(allAppliedID)
log.LogDebugf("[updateMaxMinAppliedID] PartitionID(%v) localID(%v) OK! oldMinID(%v) newMinID(%v) allAppliedID(%v)",
dp.partitionID, dp.appliedID, dp.minAppliedID, minAppliedID, allAppliedID)
dp.broadcastMinAppliedID(minAppliedID)
}
maxAppliedID, _ = dp.findMaxAppliedID(allAppliedID)
log.LogDebugf("[updateMaxMinAppliedID] PartitionID(%v) localID(%v) OK! oldMaxID(%v) newMaxID(%v)",
dp.partitionID, dp.appliedID, dp.maxAppliedID, maxAppliedID)
dp.maxAppliedID = maxAppliedID
}
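// updateMaxMinAppliedID only broadcasts a new minimum when every replica has
// answered, so an unreachable member can never cause the raft log to be
// truncated past an index it still needs. A worked example, assuming three
// replicas reported their applied indexes:
//
//	allAppliedID := []uint64{120, 95, 130}
//	// findMinAppliedID -> (95, 1): safe truncation point, broadcast to members
//	// findMaxAppliedID -> (130, 2): recorded locally as dp.maxAppliedID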
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
raftproto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
/* The functions below implement the interfaces defined in the raft library. */
// Apply puts the data onto the disk.
func (dp *DataPartition) Apply(command []byte, index uint64) (resp interface{}, err error) {
buff := bytes.NewBuffer(command)
var version uint32
if err = binary.Read(buff, binary.BigEndian, &version); err != nil {
return
}
resp = proto.OpOk
if version != BinaryMarshalMagicVersion {
var opItem *RaftCmdItem
if opItem, err = UnmarshalRaftCmd(command); err != nil {
log.LogErrorf("[ApplyRandomWrite] ApplyID(%v) Partition(%v) unmarshal failed(%v)", index, dp.partitionID, err)
return
}
log.LogInfof("[ApplyRandomWrite] ApplyID(%v) Partition(%v) opItem Op(%v)", index, dp.partitionID, opItem.Op)
if opItem.Op == uint32(proto.OpVersionOp) {
dp.fsmVersionOp(opItem)
return
}
return
}
if index > dp.metaAppliedID {
resp, err = dp.ApplyRandomWrite(command, index)
return
}
log.LogDebugf("[DataPartition.Apply] dp[%v] metaAppliedID(%v) index(%v) no need apply", dp.partitionID, dp.metaAppliedID, index)
return
}
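// Apply above distinguishes the two command encodings by their leading
// 4-byte big-endian version: BinaryMarshalMagicVersion selects the
// random-write path, anything else is decoded as a generic RaftCmdItem. A
// hypothetical sketch of reading that prefix (exampleCommandVersion,
// illustration only, not called anywhere):
func exampleCommandVersion(command []byte) (version uint32, err error) {
	err = binary.Read(bytes.NewBuffer(command), binary.BigEndian, &version)
	return
}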
// ApplyMemberChange supports adding new raft member or deleting an existing raft member.
// It does not support updating an existing member at this point.
func (dp *DataPartition) ApplyMemberChange(confChange *raftproto.ConfChange, index uint64) (resp interface{}, err error) {
defer func(index uint64) {
if err == nil {
dp.uploadApplyID(index)
} else {
err = fmt.Errorf("[ApplyMemberChange] ApplyID(%v) Partition(%v) apply err(%v)]", index, dp.partitionID, err)
exporter.Warning(err.Error())
panic(newRaftApplyError(err))
}
}(index)
// Change the status in memory
var (
isUpdated bool
)
switch confChange.Type {
case raftproto.ConfAddNode:
req := &proto.AddDataPartitionRaftMemberRequest{}
if err = json.Unmarshal(confChange.Context, req); err != nil {
return
}
log.LogInfof("action[ApplyMemberChange] ConfAddNode [%v], partitionId [%v]", req.AddPeer, req.PartitionId)
isUpdated, err = dp.addRaftNode(req, index)
if isUpdated && err == nil {
// Perform the replica update asynchronously after the member-change apply has finished.
updateWG := sync.WaitGroup{}
updateWG.Add(1)
go func() {
defer updateWG.Done()
// may fetch a stale replica list, e.g. 3 replicas reported as 2 while the add-raft-member request has not yet returned
//if err = dp.updateReplicas(true); err != nil {
// log.LogErrorf("ApplyMemberChange: update partition %v replicas failed: %v", dp.partitionID, err)
// return
//}
if dp.isLeader {
dp.ExtentStore().MoveAllToBrokenTinyExtentC(storage.TinyExtentCount)
}
}()
updateWG.Wait()
}
case raftproto.ConfRemoveNode:
req := &proto.RemoveDataPartitionRaftMemberRequest{}
if err = json.Unmarshal(confChange.Context, req); err != nil {
return
}
log.LogInfof("action[ApplyMemberChange] ConfRemoveNode [%v], partitionId [%v]", req.RemovePeer, req.PartitionId)
isUpdated, err = dp.removeRaftNode(req, index)
case raftproto.ConfUpdateNode:
log.LogDebugf("[updateRaftNode]: not support.")
default:
// do nothing
}
if err != nil {
log.LogErrorf("action[ApplyMemberChange] dp(%v) type(%v) err(%v).", dp.partitionID, confChange.Type, err)
if IsDiskErr(err.Error()) {
panic(newRaftApplyError(err))
}
return
}
if isUpdated {
dp.DataPartitionCreateType = proto.NormalCreateDataPartition
if err = dp.PersistMetadata(); err != nil {
log.LogErrorf("action[ApplyMemberChange] dp(%v) PersistMetadata err(%v).", dp.partitionID, err)
if IsDiskErr(err.Error()) {
panic(newRaftApplyError(err))
}
return
}
}
return
}
// Snapshot persists the in-memory data (as a snapshot) to the disk.
// Note that the data in each data partition has already been saved on the disk. Therefore there is no need to take the
// snapshot in this case.
func (dp *DataPartition) Snapshot() (raftproto.Snapshot, error) {
snapIterator := NewItemIterator(dp.raftPartition.AppliedIndex())
log.LogInfof("SendSnapShot PartitionID(%v) Snapshot lastTruncateID(%v) currentApplyID(%v) firstCommitID(%v)",
dp.partitionID, dp.lastTruncateID, dp.appliedID, dp.raftPartition.CommittedIndex())
return snapIterator, nil
}
// ApplySnapshot asks the raft leader for the snapshot data to recover the contents on the local disk.
func (dp *DataPartition) ApplySnapshot(peers []raftproto.Peer, iterator raftproto.SnapIterator) (err error) {
// Raft logs that have not been applied are never deleted, so a snapshot is not needed.
log.LogInfof("PartitionID(%v) ApplySnapshot to (%v)", dp.partitionID, dp.raftPartition.CommittedIndex())
return
}
// HandleFatalEvent notifies the application when panic happens.
func (dp *DataPartition) HandleFatalEvent(err *raft.FatalError) {
if isRaftApplyError(err.Err.Error()) {
dp.stopRaft()
dp.checkIsDiskError(err.Err, 0)
log.LogCriticalf("action[HandleFatalEvent] raft apply err(%v), partitionId:%v", err, dp.partitionID)
} else {
log.LogFatalf("action[HandleFatalEvent] err(%v), partitionId:%v", err, dp.partitionID)
}
}
// HandleLeaderChange notifies the application when the raft leader has changed.
func (dp *DataPartition) HandleLeaderChange(leader uint64) {
defer func() {
if r := recover(); r != nil {
mesg := fmt.Sprintf("HandleLeaderChange(%v) Raft Panic (%v)", dp.partitionID, r)
panic(mesg)
}
}()
if dp.config.NodeID == leader {
dp.isRaftLeader = true
}
}
// Put submits the raft log to the raft store.
func (dp *DataPartition) Put(key interface{}, val interface{}) (resp interface{}, err error) {
if dp.raftStopped() {
err = fmt.Errorf("%s key=%v", RaftNotStarted, key)
return
}
resp, err = dp.raftPartition.Submit(val.([]byte))
return
}
// Get returns the raft log based on the given key. It is not needed for replicating data partition.
func (dp *DataPartition) Get(key interface{}) (interface{}, error) {
return nil, nil
}
// Del deletes the raft log based on the given key. It is not needed for replicating data partition.
func (dp *DataPartition) Del(key interface{}) (interface{}, error) {
return nil, nil
}
func (dp *DataPartition) uploadApplyID(applyID uint64) {
atomic.StoreUint64(&dp.appliedID, applyID)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"bytes"
"errors"
"fmt"
"net"
"net/http"
"os"
"os/exec"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/repl"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/loadutil"
"github.com/cubefs/cubefs/util/log"
"github.com/xtaci/smux"
)
var (
ErrIncorrectStoreType = errors.New("Incorrect store type")
ErrNoSpaceToCreatePartition = errors.New("No disk space to create a data partition")
ErrNewSpaceManagerFailed = errors.New("Create new space manager failed")
ErrGetMasterDatanodeInfoFailed = errors.New("Failed to get datanode info from master")
LocalIP, serverPort string
gConnPool = util.NewConnectPool()
// MasterClient = masterSDK.NewMasterClient(nil, false)
MasterClient *masterSDK.MasterCLientWithResolver
)
const (
DefaultZoneName = proto.DefaultZoneName
DefaultRaftDir = "raft"
DefaultRaftLogsToRetain = 10 // Count of raft logs per data partition
DefaultDiskMaxErr = 1
DefaultDiskRetainMin = 5 * util.GB // GB
DefaultNameResolveInterval = 1 // minutes
DefaultDiskUnavailableErrorCount = 5
DefaultDiskUnavailablePartitionErrorCount = 3
)
const (
ModuleName = "dataNode"
)
const (
ConfigKeyLocalIP = "localIP" // string
ConfigKeyPort = "port" // int
ConfigKeyMasterAddr = "masterAddr" // array
ConfigKeyZone = "zoneName" // string
ConfigKeyDisks = "disks" // array
ConfigKeyRaftDir = "raftDir" // string
ConfigKeyRaftHeartbeat = "raftHeartbeat" // string
ConfigKeyRaftReplica = "raftReplica" // string
CfgTickInterval = "tickInterval" // int
CfgRaftRecvBufSize = "raftRecvBufSize" // int
ConfigKeyDiskPath = "diskPath" // string
configNameResolveInterval = "nameResolveInterval" // int
/*
* Metrics Degrade Level
* minus value: turn off metrics collection.
* 0 or 1: full metrics.
* 2: 1/2 of the metrics will be collected.
* 3: 1/3 of the metrics will be collected.
* ...
*/
CfgMetricsDegrade = "metricsDegrade" // int
CfgDiskRdonlySpace = "diskRdonlySpace" // int
// smux Config
ConfigKeyEnableSmuxClient = "enableSmuxConnPool" // bool
ConfigKeySmuxPortShift = "smuxPortShift" // int
ConfigKeySmuxMaxConn = "smuxMaxConn" // int
ConfigKeySmuxStreamPerConn = "smuxStreamPerConn" // int
ConfigKeySmuxMaxBuffer = "smuxMaxBuffer" // int
ConfigKeySmuxTotalStream = "sumxTotalStream" // int
// rate limit control enable
ConfigDiskQosEnable = "diskQosEnable" // bool
ConfigDiskReadIocc = "diskReadIocc" // int
ConfigDiskReadIops = "diskReadIops" // int
ConfigDiskReadFlow = "diskReadFlow" // int
ConfigDiskWriteIocc = "diskWriteIocc" // int
ConfigDiskWriteIops = "diskWriteIops" // int
ConfigDiskWriteFlow = "diskWriteFlow" // int
ConfigServiceIDKey = "serviceIDKey"
// disk status becomes unavailable if disk error partition count reaches this value
ConfigKeyDiskUnavailablePartitionErrorCount = "diskUnavailablePartitionErrorCount"
)
const cpuSampleDuration = 1 * time.Second
// DataNode defines the structure of a data node.
type DataNode struct {
space *SpaceManager
port string
zoneName string
clusterID string
localIP string
bindIp bool
localServerAddr string
nodeID uint64
raftDir string
raftHeartbeat string
raftReplica string
raftStore raftstore.RaftStore
tickInterval int
raftRecvBufSize int
startTime int64
tcpListener net.Listener
stopC chan bool
smuxPortShift int
enableSmuxConnPool bool
smuxConnPool *util.SmuxConnectPool
smuxListener net.Listener
smuxServerConfig *smux.Config
smuxConnPoolConfig *util.SmuxConnPoolConfig
getRepairConnFunc func(target string) (net.Conn, error)
putRepairConnFunc func(conn net.Conn, forceClose bool)
metrics *DataNodeMetrics
metricsDegrade int64
metricsCnt uint64
volUpdating sync.Map // map[string]*verOp2Phase
control common.Control
diskQosEnable bool
diskQosEnableFromMaster bool
diskReadIocc int
diskReadIops int
diskReadFlow int
diskWriteIocc int
diskWriteIops int
diskWriteFlow int
dpMaxRepairErrCnt uint64
dpRepairTimeOut uint64
clusterUuid string
clusterUuidEnable bool
serviceIDKey string
cpuUtil atomicutil.Float64
cpuSamplerDone chan struct{}
diskUnavailablePartitionErrorCount uint64 // disk status becomes unavailable when disk error partition count reaches this value
}
type verOp2Phase struct {
verSeq uint64
verPrepare uint64
status uint32
step uint32
op uint8
sync.Mutex
}
func NewServer() *DataNode {
return &DataNode{}
}
func (s *DataNode) Start(cfg *config.Config) (err error) {
runtime.GOMAXPROCS(runtime.NumCPU())
return s.control.Start(s, cfg, doStart)
}
// Shutdown shuts down the current data node.
func (s *DataNode) Shutdown() {
s.control.Shutdown(s, doShutdown)
}
// Sync keeps data node in sync.
func (s *DataNode) Sync() {
s.control.Sync()
}
// Workflow of starting up a data node.
func doStart(server common.Server, cfg *config.Config) (err error) {
s, ok := server.(*DataNode)
if !ok {
return errors.New("Invalid node Type!")
}
s.stopC = make(chan bool)
// parse the config file
if err = s.parseConfig(cfg); err != nil {
return
}
exporter.Init(ModuleName, cfg)
s.registerMetrics()
s.register(cfg)
// parse the smux config
if err = s.parseSmuxConfig(cfg); err != nil {
return
}
// connection pool must be created before initSpaceManager
s.initConnPool()
// init limit
initRepairLimit()
// start the raft server
if err = s.startRaftServer(cfg); err != nil {
return
}
// create space manager (disk, partition, etc.)
if err = s.startSpaceManager(cfg); err != nil {
return
}
// check local partitions against the master; if any are missing, do not start
if _, err = s.checkLocalPartitionMatchWithMaster(); err != nil {
log.LogError(err)
exporter.Warning(err.Error())
return
}
// tcp listening & tcp connection pool
if err = s.startTCPService(); err != nil {
return
}
// smux listening & smux connection pool
if err = s.startSmuxService(cfg); err != nil {
return
}
go s.registerHandler()
s.scheduleTask()
// start metrics (LackDpCount, etc.)
s.startMetrics()
// start cpu sampler
s.startCpuSample()
return
}
func doShutdown(server common.Server) {
s, ok := server.(*DataNode)
if !ok {
return
}
s.closeMetrics()
close(s.stopC)
s.space.Stop()
s.stopUpdateNodeInfo()
s.stopTCPService()
s.stopRaftServer()
s.stopSmuxService()
s.closeSmuxConnPool()
MasterClient.Stop()
// stop cpu sample
close(s.cpuSamplerDone)
}
func (s *DataNode) parseConfig(cfg *config.Config) (err error) {
var (
port string
regexpPort *regexp.Regexp
)
LocalIP = cfg.GetString(ConfigKeyLocalIP)
port = cfg.GetString(proto.ListenPort)
s.bindIp = cfg.GetBool(proto.BindIpKey)
serverPort = port
if regexpPort, err = regexp.Compile(`^(\d)+$`); err != nil {
return fmt.Errorf("Err:no port")
}
if !regexpPort.MatchString(port) {
return fmt.Errorf("Err:port must string")
}
s.port = port
/*for _, ip := range cfg.GetSlice(proto.MasterAddr) {
MasterClient.AddNode(ip.(string))
}*/
updateInterval := cfg.GetInt(configNameResolveInterval)
if updateInterval <= 0 || updateInterval > 60 {
log.LogWarnf("name resolving interval[1-60] is set to default: %v", DefaultNameResolveInterval)
updateInterval = DefaultNameResolveInterval
}
addrs := cfg.GetSlice(proto.MasterAddr)
if len(addrs) == 0 {
return fmt.Errorf("Err:masterAddr unavalid")
}
masters := make([]string, 0, len(addrs))
for _, addr := range addrs {
masters = append(masters, addr.(string))
}
MasterClient = masterSDK.NewMasterCLientWithResolver(masters, false, updateInterval)
if MasterClient == nil {
err = fmt.Errorf("parseConfig: masters addrs format err[%v]", masters)
log.LogErrorf("parseConfig: masters addrs format err[%v]", masters)
return err
}
if err = MasterClient.Start(); err != nil {
return err
}
s.zoneName = cfg.GetString(ConfigKeyZone)
if s.zoneName == "" {
s.zoneName = DefaultZoneName
}
s.metricsDegrade = cfg.GetInt64(CfgMetricsDegrade)
s.serviceIDKey = cfg.GetString(ConfigServiceIDKey)
diskUnavailablePartitionErrorCount := cfg.GetInt64(ConfigKeyDiskUnavailablePartitionErrorCount)
if diskUnavailablePartitionErrorCount <= 0 || diskUnavailablePartitionErrorCount > 100 {
diskUnavailablePartitionErrorCount = DefaultDiskUnavailablePartitionErrorCount
log.LogDebugf("action[parseConfig] ConfigKeyDiskUnavailablePartitionErrorCount(%v) out of range, set as default(%v)",
diskUnavailablePartitionErrorCount, DefaultDiskUnavailablePartitionErrorCount)
}
s.diskUnavailablePartitionErrorCount = uint64(diskUnavailablePartitionErrorCount)
log.LogDebugf("action[parseConfig] load diskUnavailablePartitionErrorCount(%v)", s.diskUnavailablePartitionErrorCount)
log.LogDebugf("action[parseConfig] load masterAddrs(%v).", MasterClient.Nodes())
log.LogDebugf("action[parseConfig] load port(%v).", s.port)
log.LogDebugf("action[parseConfig] load zoneName(%v).", s.zoneName)
return
}
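// An illustrative config fragment using the key names from the const block
// above (placeholder values only, not recommendations; the exact set of keys
// consumed at startup is defined by parseConfig, parseRaftConfig and
// parseSmuxConfig):
//
//	{
//	    "localIP":       "192.168.0.11",
//	    "port":          "17310",
//	    "zoneName":      "default",
//	    "masterAddr":    ["192.168.0.1:17010", "192.168.0.2:17010"],
//	    "disks":         ["/data/disk1:10737418240"],
//	    "raftDir":       "/data/raft",
//	    "raftHeartbeat": "17330",
//	    "raftReplica":   "17340"
//	}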
func (s *DataNode) initQosLimit(cfg *config.Config) {
dn := s.space.dataNode
dn.diskQosEnable = cfg.GetBoolWithDefault(ConfigDiskQosEnable, true)
dn.diskReadIocc = cfg.GetInt(ConfigDiskReadIocc)
dn.diskReadIops = cfg.GetInt(ConfigDiskReadIops)
dn.diskReadFlow = cfg.GetInt(ConfigDiskReadFlow)
dn.diskWriteIocc = cfg.GetInt(ConfigDiskWriteIocc)
dn.diskWriteIops = cfg.GetInt(ConfigDiskWriteIops)
dn.diskWriteFlow = cfg.GetInt(ConfigDiskWriteFlow)
log.LogWarnf("action[initQosLimit] set qos [%v], read(iocc:%d iops:%d flow:%d) write(iocc:%d iops:%d flow:%d)",
dn.diskQosEnable, dn.diskReadIocc, dn.diskReadIops, dn.diskReadFlow, dn.diskWriteIocc, dn.diskWriteIops, dn.diskWriteFlow)
}
func (s *DataNode) updateQosLimit() {
for _, disk := range s.space.disks {
disk.updateQosLimiter()
}
}
func (s *DataNode) startSpaceManager(cfg *config.Config) (err error) {
s.startTime = time.Now().Unix()
s.space = NewSpaceManager(s)
if len(strings.TrimSpace(s.port)) == 0 {
err = ErrNewSpaceManagerFailed
return
}
s.space.SetRaftStore(s.raftStore)
s.space.SetNodeID(s.nodeID)
s.space.SetClusterID(s.clusterID)
s.initQosLimit(cfg)
diskRdonlySpace := uint64(cfg.GetInt64(CfgDiskRdonlySpace))
if diskRdonlySpace < DefaultDiskRetainMin {
diskRdonlySpace = DefaultDiskRetainMin
}
log.LogInfof("startSpaceManager preReserveSpace %d", diskRdonlySpace)
paths := make([]string, 0)
diskPath := cfg.GetString(ConfigKeyDiskPath)
if diskPath != "" {
paths, err = parseDiskPath(diskPath)
if err != nil {
log.LogErrorf("parse diskpath failed, path %s, err %s", diskPath, err.Error())
return err
}
} else {
for _, p := range cfg.GetSlice(ConfigKeyDisks) {
paths = append(paths, p.(string))
}
}
var wg sync.WaitGroup
for _, d := range paths {
log.LogDebugf("action[startSpaceManager] load disk raw config(%v).", d)
// format "PATH:RESET_SIZE
arr := strings.Split(d, ":")
if len(arr) != 2 {
return errors.New("Invalid disk configuration. Example: PATH:RESERVE_SIZE")
}
path := arr[0]
fileInfo, err := os.Stat(path)
if err != nil {
log.LogErrorf("Stat disk path [%v] error: [%s]", path, err)
continue
}
if !fileInfo.IsDir() {
return errors.New("Disk path is not dir")
}
if s.clusterUuidEnable {
if err = config.CheckOrStoreClusterUuid(path, s.clusterUuid, false); err != nil {
log.LogErrorf("CheckOrStoreClusterUuid failed: %v", err)
return fmt.Errorf("CheckOrStoreClusterUuid failed: %v", err.Error())
}
}
reservedSpace, err := strconv.ParseUint(arr[1], 10, 64)
if err != nil {
return fmt.Errorf("Invalid disk reserved space. Error: %s", err.Error())
}
if reservedSpace < DefaultDiskRetainMin {
reservedSpace = DefaultDiskRetainMin
}
wg.Add(1)
go func(wg *sync.WaitGroup, path string, reservedSpace uint64) {
defer wg.Done()
s.space.LoadDisk(path, reservedSpace, diskRdonlySpace, DefaultDiskMaxErr)
}(&wg, path, reservedSpace)
}
wg.Wait()
// start async sample
s.space.StartDiskSample()
s.updateQosLimit() // load from config
return nil
}
// parseDiskPath executes a shell command to find all mount points matching the configured prefix.
// output example: /disk1:1024, /disk2:1024
func parseDiskPath(pathStr string) (disks []string, err error) {
log.LogInfof("parse diskpath, %s", pathStr)
arr := strings.Split(pathStr, ":")
if len(arr) != 2 {
return disks, fmt.Errorf("diskPath cfg should be diskPathPrefix:RESERVE_SIZE")
}
shell := fmt.Sprintf("mount | grep %s | awk '{print $3}'", arr[0])
cmd := exec.Command("/bin/sh", "-c", shell)
log.LogWarnf("execute diskPath shell, %s", shell)
out, err := cmd.CombinedOutput()
if err != nil {
return disks, fmt.Errorf("execute shell failed, %s", err.Error())
}
disks = make([]string, 0)
lines := bytes.Split(out, []byte("\n"))
for _, line := range lines {
str := strings.TrimSpace(string(line))
if str == "" {
continue
}
disks = append(disks, fmt.Sprintf("%s:%s", str, arr[1]))
}
return disks, nil
}
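// For example, with a diskPath value of "/data/disk:10737418240" the shell
// above becomes `mount | grep /data/disk | awk '{print $3}'`; if it prints
// /data/disk1 and /data/disk2, parseDiskPath returns
// ["/data/disk1:10737418240", "/data/disk2:10737418240"], i.e. every matching
// mount point paired with the shared reserve size (illustrative paths only).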
// register registers the data node on the master and reports information such as the IP address.
// The startup of a data node will be blocked until the registration succeeds.
func (s *DataNode) register(cfg *config.Config) {
var err error
timer := time.NewTimer(0)
// get the IP address, cluster ID and node ID from the master
for {
select {
case <-timer.C:
var ci *proto.ClusterInfo
if ci, err = MasterClient.AdminAPI().GetClusterInfo(); err != nil {
log.LogErrorf("action[registerToMaster] cannot get ip from master(%v) err(%v).",
MasterClient.Leader(), err)
timer.Reset(2 * time.Second)
continue
}
masterAddr := MasterClient.Leader()
s.clusterUuid = ci.ClusterUuid
s.clusterUuidEnable = ci.ClusterUuidEnable
s.clusterID = ci.Cluster
if LocalIP == "" {
LocalIP = string(ci.Ip)
}
s.localServerAddr = fmt.Sprintf("%s:%v", LocalIP, s.port)
if !util.IsIPV4(LocalIP) {
log.LogErrorf("action[registerToMaster] got an invalid local ip(%v) from master(%v).",
LocalIP, masterAddr)
timer.Reset(2 * time.Second)
continue
}
// register this data node on the master
var nodeID uint64
if nodeID, err = MasterClient.NodeAPI().AddDataNodeWithAuthNode(fmt.Sprintf("%s:%v", LocalIP, s.port),
s.zoneName, s.serviceIDKey); err != nil {
log.LogErrorf("action[registerToMaster] cannot register this node to master[%v] err(%v).",
masterAddr, err)
timer.Reset(2 * time.Second)
continue
}
exporter.RegistConsul(s.clusterID, ModuleName, cfg)
s.nodeID = nodeID
log.LogDebugf("register: register DataNode: nodeID(%v)", s.nodeID)
return
case <-s.stopC:
timer.Stop()
return
}
}
}
type DataNodeInfo struct {
Addr string
PersistenceDataPartitions []uint64
}
func (s *DataNode) checkLocalPartitionMatchWithMaster() (lackPartitions []uint64, err error) {
convert := func(node *proto.DataNodeInfo) *DataNodeInfo {
result := &DataNodeInfo{}
result.Addr = node.Addr
result.PersistenceDataPartitions = node.PersistenceDataPartitions
return result
}
var dataNode *proto.DataNodeInfo
for i := 0; i < 3; i++ {
if dataNode, err = MasterClient.NodeAPI().GetDataNode(s.localServerAddr); err != nil {
log.LogErrorf("checkLocalPartitionMatchWithMaster error %v", err)
continue
}
break
}
if dataNode == nil {
err = ErrGetMasterDatanodeInfoFailed
return
}
dinfo := convert(dataNode)
if len(dinfo.PersistenceDataPartitions) == 0 {
return
}
for _, partitionID := range dinfo.PersistenceDataPartitions {
dp := s.space.Partition(partitionID)
if dp == nil {
lackPartitions = append(lackPartitions, partitionID)
}
}
if len(lackPartitions) == 0 {
log.LogInfo("checkLocalPartitionMatchWithMaster no lack")
} else {
log.LogErrorf("checkLocalPartitionMatchWithMaster lack ids [%v]", lackPartitions)
}
return
}
func (s *DataNode) checkPartitionInMemoryMatchWithInDisk() (lackPartitions []uint64) {
s.space.partitionMutex.RLock()
partitions := make([]*DataPartition, 0)
for _, dp := range s.space.partitions {
partitions = append(partitions, dp)
}
s.space.partitionMutex.RUnlock()
for _, dp := range partitions {
stat, err := os.Stat(dp.path)
if err != nil {
lackPartitions = append(lackPartitions, dp.partitionID)
log.LogErrorf("action[checkPartitionInMemoryMatchWithInDisk] stat dataPartition[%v] fail, path[%v], err[%v]", dp.partitionID, dp.Path(), err)
continue
}
if !stat.IsDir() {
lackPartitions = append(lackPartitions, dp.partitionID)
log.LogErrorf("action[checkPartitionInMemoryMatchWithInDisk] dataPartition[%v] is not directory, path[%v]", dp.partitionID, dp.Path())
continue
}
}
return
}
func (s *DataNode) registerHandler() {
http.HandleFunc("/disks", s.getDiskAPI)
http.HandleFunc("/partitions", s.getPartitionsAPI)
http.HandleFunc("/partition", s.getPartitionAPI)
http.HandleFunc("/extent", s.getExtentAPI)
http.HandleFunc("/block", s.getBlockCrcAPI)
http.HandleFunc("/stats", s.getStatAPI)
http.HandleFunc("/raftStatus", s.getRaftStatus)
http.HandleFunc("/setAutoRepairStatus", s.setAutoRepairStatus)
http.HandleFunc("/getTinyDeleted", s.getTinyDeleted)
http.HandleFunc("/getNormalDeleted", s.getNormalDeleted)
http.HandleFunc("/getSmuxPoolStat", s.getSmuxPoolStat())
http.HandleFunc("/setMetricsDegrade", s.setMetricsDegrade)
http.HandleFunc("/getMetricsDegrade", s.getMetricsDegrade)
http.HandleFunc("/qosEnable", s.setQosEnable())
http.HandleFunc("/genClusterVersionFile", s.genClusterVersionFile)
http.HandleFunc("/setDiskBad", s.setDiskBadAPI)
http.HandleFunc("/setDiskQos", s.setDiskQos)
http.HandleFunc("/getDiskQos", s.getDiskQos)
}
func (s *DataNode) startTCPService() (err error) {
log.LogInfo("Start: startTCPService")
addr := fmt.Sprintf(":%v", s.port)
if s.bindIp {
addr = fmt.Sprintf("%s:%v", LocalIP, s.port)
}
l, err := net.Listen(NetworkProtocol, addr)
log.LogDebugf("action[startTCPService] listen %v address(%v).", NetworkProtocol, addr)
if err != nil {
log.LogError("failed to listen, err:", err)
return
}
s.tcpListener = l
go func(ln net.Listener) {
for {
conn, err := ln.Accept()
if err != nil {
log.LogErrorf("action[startTCPService] failed to accept, err:%s", err.Error())
break
}
log.LogDebugf("action[startTCPService] accept connection from %s.", conn.RemoteAddr().String())
go s.serveConn(conn)
}
}(l)
return
}
func (s *DataNode) stopTCPService() (err error) {
if s.tcpListener != nil {
s.tcpListener.Close()
log.LogDebugf("action[stopTCPService] stop tcp service.")
}
return
}
func (s *DataNode) serveConn(conn net.Conn) {
space := s.space
space.Stats().AddConnection()
c, _ := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
packetProcessor := repl.NewReplProtocol(conn, s.Prepare, s.OperatePacket, s.Post)
packetProcessor.ServerConn()
space.Stats().RemoveConnection()
}
func (s *DataNode) startSmuxService(cfg *config.Config) (err error) {
log.LogInfo("Start: startSmuxService")
addr := fmt.Sprintf(":%v", s.port)
if s.bindIp {
addr = fmt.Sprintf("%s:%v", LocalIP, s.port)
}
addr = util.ShiftAddrPort(addr, s.smuxPortShift)
log.LogInfof("SmuxListenAddr: (%v)", addr)
// server
l, err := net.Listen(NetworkProtocol, addr)
log.LogDebugf("action[startSmuxService] listen %v address(%v).", NetworkProtocol, addr)
if err != nil {
log.LogError("failed to listen smux addr, err:", err)
return
}
s.smuxListener = l
go func(ln net.Listener) {
for {
conn, err := ln.Accept()
if err != nil {
log.LogErrorf("action[startSmuxService] failed to accept, err:%s", err.Error())
break
}
log.LogDebugf("action[startSmuxService] accept connection from %s.", conn.RemoteAddr().String())
go s.serveSmuxConn(conn)
}
}(l)
return
}
func (s *DataNode) stopSmuxService() (err error) {
if s.smuxListener != nil {
s.smuxListener.Close()
log.LogDebugf("action[stopSmuxService] stop smux service.")
}
return
}
func (s *DataNode) serveSmuxConn(conn net.Conn) {
space := s.space
space.Stats().AddConnection()
c, _ := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
var sess *smux.Session
var err error
sess, err = smux.Server(conn, s.smuxServerConfig)
if err != nil {
log.LogErrorf("action[serveSmuxConn] failed to serve smux connection, addr(%v), err(%v)", c.RemoteAddr(), err)
return
}
defer func() {
sess.Close()
space.Stats().RemoveConnection()
}()
for {
stream, err := sess.AcceptStream()
if err != nil {
if util.FilterSmuxAcceptError(err) != nil {
log.LogErrorf("action[startSmuxService] failed to accept, err: %s", err)
} else {
log.LogInfof("action[startSmuxService] accept done, err: %s", err)
}
break
}
go s.serveSmuxStream(stream)
}
}
func (s *DataNode) serveSmuxStream(stream *smux.Stream) {
packetProcessor := repl.NewReplProtocol(stream, s.Prepare, s.OperatePacket, s.Post)
if s.enableSmuxConnPool {
packetProcessor.SetSmux(s.getRepairConnFunc, s.putRepairConnFunc)
}
packetProcessor.ServerConn()
}
func (s *DataNode) parseSmuxConfig(cfg *config.Config) error {
s.enableSmuxConnPool = cfg.GetBool(ConfigKeyEnableSmuxClient)
s.smuxPortShift = int(cfg.GetInt64(ConfigKeySmuxPortShift))
if s.smuxPortShift == 0 {
s.smuxPortShift = util.DefaultSmuxPortShift
}
// smux server cfg
s.smuxServerConfig = util.DefaultSmuxConfig()
maxBuffer := cfg.GetInt64(ConfigKeySmuxMaxBuffer)
if maxBuffer > 0 {
s.smuxServerConfig.MaxReceiveBuffer = int(maxBuffer)
if s.smuxServerConfig.MaxStreamBuffer > int(maxBuffer) {
s.smuxServerConfig.MaxStreamBuffer = int(maxBuffer)
}
if err := smux.VerifyConfig(s.smuxServerConfig); err != nil {
return err
}
}
// smux conn pool config
if s.enableSmuxConnPool {
s.smuxConnPoolConfig = util.DefaultSmuxConnPoolConfig()
if maxBuffer > 0 {
s.smuxConnPoolConfig.MaxReceiveBuffer = int(maxBuffer)
if s.smuxConnPoolConfig.MaxStreamBuffer > int(maxBuffer) {
s.smuxConnPoolConfig.MaxStreamBuffer = int(maxBuffer)
}
}
maxConn := cfg.GetInt64(ConfigKeySmuxMaxConn)
if maxConn > 0 {
if s.smuxConnPoolConfig.ConnsPerAddr < int(maxConn) {
s.smuxConnPoolConfig.ConnsPerAddr = int(maxConn)
}
}
maxStreamPerConn := cfg.GetInt64(ConfigKeySmuxStreamPerConn)
if maxStreamPerConn > 0 {
s.smuxConnPoolConfig.StreamsPerConn = int(maxStreamPerConn)
}
totalStreams := cfg.GetInt64(ConfigKeySmuxTotalStream)
if totalStreams > 0 {
s.smuxConnPoolConfig.TotalStreams = int(totalStreams)
}
if err := util.VerifySmuxPoolConfig(s.smuxConnPoolConfig); err != nil {
return err
}
}
log.LogDebugf("[parseSmuxConfig] load smuxPortShift(%v).", s.smuxPortShift)
log.LogDebugf("[parseSmuxConfig] load enableSmuxConnPool(%v).", s.enableSmuxConnPool)
log.LogDebugf("[parseSmuxConfig] load smuxServerConfig(%v).", s.smuxServerConfig)
log.LogDebugf("[parseSmuxConfig] load smuxConnPoolConfig(%v).", s.smuxConnPoolConfig)
return nil
}
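// An illustrative smux fragment using the keys from the const block above
// (placeholder values only, not tuning advice):
//
//	{
//	    "enableSmuxConnPool": true,
//	    "smuxPortShift":      500,
//	    "smuxMaxConn":        10,
//	    "smuxStreamPerConn":  2,
//	    "smuxMaxBuffer":      1048576
//	}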
func (s *DataNode) initConnPool() {
if s.enableSmuxConnPool {
log.LogInfof("Start: init smux conn pool")
s.smuxConnPool = util.NewSmuxConnectPool(s.smuxConnPoolConfig)
s.getRepairConnFunc = func(target string) (net.Conn, error) {
addr := util.ShiftAddrPort(target, s.smuxPortShift)
log.LogDebugf("[dataNode.getRepairConnFunc] get smux conn, addr(%v)", addr)
return s.smuxConnPool.GetConnect(addr)
}
s.putRepairConnFunc = func(conn net.Conn, forceClose bool) {
log.LogDebugf("[dataNode.putRepairConnFunc] put smux conn, addr(%v), forceClose(%v)", conn.RemoteAddr().String(), forceClose)
s.smuxConnPool.PutConnect(conn.(*smux.Stream), forceClose)
}
} else {
s.getRepairConnFunc = func(target string) (conn net.Conn, err error) {
log.LogDebugf("[dataNode.getRepairConnFunc] get tcp conn, addr(%v)", target)
return gConnPool.GetConnect(target)
}
s.putRepairConnFunc = func(conn net.Conn, forceClose bool) {
log.LogDebugf("[dataNode.putRepairConnFunc] put tcp conn, addr(%v), forceClose(%v)", conn.RemoteAddr().String(), forceClose)
gConnPool.PutConnect(conn.(*net.TCPConn), forceClose)
}
}
}
func (s *DataNode) closeSmuxConnPool() {
if s.smuxConnPool != nil {
s.smuxConnPool.Close()
log.LogDebugf("action[stopSmuxService] stop smux conn pool")
}
}
func (s *DataNode) shallDegrade() bool {
level := atomic.LoadInt64(&s.metricsDegrade)
if level < 0 {
return true
}
if level == 0 {
return false
}
cnt := atomic.LoadUint64(&s.metricsCnt)
return cnt%uint64(level) != 0
}
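// Worked example of the degrade levels documented in the const block above:
// with metricsDegrade = 3 and a steadily increasing metricsCnt, shallDegrade
// returns false only when cnt%3 == 0, so roughly 1/3 of the metrics are still
// collected; a negative level suppresses collection entirely, and 0 or 1
// keeps it at the full rate.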
func (s *DataNode) scheduleTask() {
go s.startUpdateNodeInfo()
s.scheduleToCheckLackPartitions()
}
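// startCpuSample launches a background loop that periodically samples CPU utilization
// until cpuSamplerDone is closed.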
func (s *DataNode) startCpuSample() {
s.cpuSamplerDone = make(chan struct{})
go func() {
for {
select {
case <-s.cpuSamplerDone:
return
default:
// GetCpuUtilPercent blocks for cpuSampleDuration while sampling
used, err := loadutil.GetCpuUtilPercent(cpuSampleDuration)
if err == nil {
s.cpuUtil.Store(used)
}
}
}
}()
}
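// scheduleToCheckLackPartitions periodically compares the partitions reported by the
// master with those loaded in memory, and the in-memory partitions with those present
// on disk, logging and exporting any that are missing.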
func (s *DataNode) scheduleToCheckLackPartitions() {
go func() {
for {
lackPartitionsInMem, err := s.checkLocalPartitionMatchWithMaster()
if err != nil {
log.LogError(err)
}
if len(lackPartitionsInMem) > 0 {
err = fmt.Errorf("action[scheduleToLackDataPartitions] lackPartitions %v in datanode %v memory",
lackPartitionsInMem, s.localServerAddr)
log.LogErrorf(err.Error())
}
s.space.stats.updateMetricLackPartitionsInMem(uint64(len(lackPartitionsInMem)))
lackPartitionsInDisk := s.checkPartitionInMemoryMatchWithInDisk()
if len(lackPartitionsInDisk) > 0 {
err = fmt.Errorf("action[scheduleToLackDataPartitions] lackPartitions %v in datanode %v disk",
lackPartitionsInDisk, s.localServerAddr)
log.LogErrorf(err.Error())
}
s.space.stats.updateMetricLackPartitionsInDisk(uint64(len(lackPartitionsInDisk)))
time.Sleep(1 * time.Minute)
}
}()
}
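// IsDiskErr reports whether the error message indicates an I/O, read-only filesystem or
// permission error, all of which are treated as disk failures.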
func IsDiskErr(errMsg string) bool {
return strings.Contains(errMsg, syscall.EIO.Error()) ||
strings.Contains(errMsg, syscall.EROFS.Error()) ||
strings.Contains(errMsg, syscall.EACCES.Error())
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/json"
"fmt"
"net/http"
"os"
"path"
"strconv"
"sync/atomic"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/log"
)
var AutoRepairStatus = true
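// getDiskAPI returns the space usage and status of every disk managed by this datanode.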
func (s *DataNode) getDiskAPI(w http.ResponseWriter, r *http.Request) {
disks := make([]interface{}, 0)
for _, diskItem := range s.space.GetDisks() {
disk := &struct {
Path string `json:"path"`
Total uint64 `json:"total"`
Used uint64 `json:"used"`
Available uint64 `json:"available"`
Unallocated uint64 `json:"unallocated"`
Allocated uint64 `json:"allocated"`
Status int `json:"status"`
RestSize uint64 `json:"restSize"`
DiskRdoSize uint64 `json:"diskRdoSize"`
Partitions int `json:"partitions"`
Decommission bool `json:"decommission"`
}{
Path: diskItem.Path,
Total: diskItem.Total,
Used: diskItem.Used,
Available: diskItem.Available,
Unallocated: diskItem.Unallocated,
Allocated: diskItem.Allocated,
Status: diskItem.Status,
RestSize: diskItem.ReservedSpace,
DiskRdoSize: diskItem.DiskRdonlySpace,
Partitions: diskItem.PartitionCount(),
Decommission: diskItem.GetDecommissionStatus(),
}
disks = append(disks, disk)
}
diskReport := &struct {
Disks []interface{} `json:"disks"`
Zone string `json:"zone"`
}{
Disks: disks,
Zone: s.zoneName,
}
s.buildSuccessResp(w, diskReport)
}
func (s *DataNode) getStatAPI(w http.ResponseWriter, r *http.Request) {
response := &proto.DataNodeHeartbeatResponse{}
s.buildHeartBeatResponse(response)
s.buildSuccessResp(w, response)
}
func (s *DataNode) setAutoRepairStatus(w http.ResponseWriter, r *http.Request) {
const (
paramAutoRepair = "autoRepair"
)
if err := r.ParseForm(); err != nil {
err = fmt.Errorf("parse form fail: %v", err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
autoRepair, err := strconv.ParseBool(r.FormValue(paramAutoRepair))
if err != nil {
err = fmt.Errorf("parse param %v fail: %v", paramAutoRepair, err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
AutoRepairStatus = autoRepair
s.buildSuccessResp(w, autoRepair)
}
func (s *DataNode) getRaftStatus(w http.ResponseWriter, r *http.Request) {
const (
paramRaftID = "raftID"
)
if err := r.ParseForm(); err != nil {
err = fmt.Errorf("parse form fail: %v", err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
raftID, err := strconv.ParseUint(r.FormValue(paramRaftID), 10, 64)
if err != nil {
err = fmt.Errorf("parse param %v fail: %v", paramRaftID, err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
raftStatus := s.raftStore.RaftStatus(raftID)
s.buildSuccessResp(w, raftStatus)
}
func (s *DataNode) getPartitionsAPI(w http.ResponseWriter, r *http.Request) {
partitions := make([]interface{}, 0)
s.space.RangePartitions(func(dp *DataPartition) bool {
partition := &struct {
ID uint64 `json:"id"`
Size int `json:"size"`
Used int `json:"used"`
Status int `json:"status"`
Path string `json:"path"`
Replicas []string `json:"replicas"`
}{
ID: dp.partitionID,
Size: dp.Size(),
Used: dp.Used(),
Status: dp.Status(),
Path: dp.Path(),
Replicas: dp.Replicas(),
}
partitions = append(partitions, partition)
return true
})
result := &struct {
Partitions []interface{} `json:"partitions"`
PartitionCount int `json:"partitionCount"`
}{
Partitions: partitions,
PartitionCount: len(partitions),
}
s.buildSuccessResp(w, result)
}
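// getPartitionAPI returns the detailed state of a single data partition, including its
// extent watermarks and raft status.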
func (s *DataNode) getPartitionAPI(w http.ResponseWriter, r *http.Request) {
const (
paramPartitionID = "id"
)
var (
partitionID uint64
files []*storage.ExtentInfo
err error
tinyDeleteRecordSize int64
raftSt *raft.Status
)
if err = r.ParseForm(); err != nil {
err = fmt.Errorf("parse form fail: %v", err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue(paramPartitionID), 10, 64); err != nil {
err = fmt.Errorf("parse param %v fail: %v", paramPartitionID, err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if files, tinyDeleteRecordSize, err = partition.ExtentStore().GetAllWatermarks(nil); err != nil {
err = fmt.Errorf("get watermark fail: %v", err)
s.buildFailureResp(w, http.StatusInternalServerError, err.Error())
return
}
if partition.IsDataPartitionLoading() {
raftSt = &raft.Status{Stopped: true}
} else {
raftSt = partition.raftPartition.Status()
}
result := &struct {
VolName string `json:"volName"`
ID uint64 `json:"id"`
Size int `json:"size"`
Used int `json:"used"`
Status int `json:"status"`
Path string `json:"path"`
Files []*storage.ExtentInfo `json:"extents"`
FileCount int `json:"fileCount"`
Replicas []string `json:"replicas"`
TinyDeleteRecordSize int64 `json:"tinyDeleteRecordSize"`
RaftStatus *raft.Status `json:"raftStatus"`
}{
VolName: partition.volumeID,
ID: partition.partitionID,
Size: partition.Size(),
Used: partition.Used(),
Status: partition.Status(),
Path: partition.Path(),
Files: files,
FileCount: len(files),
Replicas: partition.Replicas(),
TinyDeleteRecordSize: tinyDeleteRecordSize,
RaftStatus: raftSt,
}
if partition.isNormalType() {
result.RaftStatus = partition.raftPartition.Status()
}
s.buildSuccessResp(w, result)
}
func (s *DataNode) getExtentAPI(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
extentID int
err error
extentInfo *storage.ExtentInfo
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue("partitionID"), 10, 64); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if extentID, err = strconv.Atoi(r.FormValue("extentID")); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if extentInfo, err = partition.ExtentStore().Watermark(uint64(extentID)); err != nil {
s.buildFailureResp(w, 500, err.Error())
return
}
s.buildSuccessResp(w, extentInfo)
}
func (s *DataNode) getBlockCrcAPI(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
extentID int
err error
blocks []*storage.BlockCrc
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue("partitionID"), 10, 64); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if extentID, err = strconv.Atoi(r.FormValue("extentID")); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if blocks, err = partition.ExtentStore().ScanBlocks(uint64(extentID)); err != nil {
s.buildFailureResp(w, 500, err.Error())
return
}
s.buildSuccessResp(w, blocks)
}
func (s *DataNode) getTinyDeleted(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
err error
extentInfo []storage.ExtentDeleted
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue("id"), 10, 64); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if extentInfo, err = partition.ExtentStore().GetHasDeleteTinyRecords(); err != nil {
s.buildFailureResp(w, 500, err.Error())
return
}
s.buildSuccessResp(w, extentInfo)
}
func (s *DataNode) getNormalDeleted(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
err error
extentInfo []storage.ExtentDeleted
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue("id"), 10, 64); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if extentInfo, err = partition.ExtentStore().GetHasDeleteExtent(); err != nil {
s.buildFailureResp(w, 500, err.Error())
return
}
s.buildSuccessResp(w, extentInfo)
}
func (s *DataNode) setQosEnable() func(http.ResponseWriter, *http.Request) {
return func(w http.ResponseWriter, r *http.Request) {
var (
err error
enable bool
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if enable, err = strconv.ParseBool(r.FormValue("enable")); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
s.diskQosEnable = enable
s.buildSuccessResp(w, "success")
}
}
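// setDiskQos updates the disk QoS limits (iocc/iops/flow for read and write) from the
// request form values and re-applies the limits when anything changed.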
func (s *DataNode) setDiskQos(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
parser := func(key string) (val int64, err error, has bool) {
valStr := r.FormValue(key)
if valStr == "" {
return 0, nil, false
}
has = true
val, err = strconv.ParseInt(valStr, 10, 64)
return
}
updated := false
for key, pVal := range map[string]*int{
ConfigDiskReadIocc: &s.diskReadIocc,
ConfigDiskReadIops: &s.diskReadIops,
ConfigDiskReadFlow: &s.diskReadFlow,
ConfigDiskWriteIocc: &s.diskWriteIocc,
ConfigDiskWriteIops: &s.diskWriteIops,
ConfigDiskWriteFlow: &s.diskWriteFlow,
} {
val, err, has := parser(key)
if err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if has {
updated = true
*pVal = int(val)
}
}
if updated {
s.updateQosLimit()
}
s.buildSuccessResp(w, "success")
}
func (s *DataNode) getDiskQos(w http.ResponseWriter, r *http.Request) {
disks := make([]interface{}, 0)
for _, diskItem := range s.space.GetDisks() {
disk := &struct {
Path string `json:"path"`
Read LimiterStatus `json:"read"`
Write LimiterStatus `json:"write"`
}{
Path: diskItem.Path,
Read: diskItem.limitRead.Status(),
Write: diskItem.limitWrite.Status(),
}
disks = append(disks, disk)
}
diskStatus := &struct {
Disks []interface{} `json:"disks"`
Zone string `json:"zone"`
}{
Disks: disks,
Zone: s.zoneName,
}
s.buildSuccessResp(w, diskStatus)
}
func (s *DataNode) getSmuxPoolStat() func(http.ResponseWriter, *http.Request) {
return func(w http.ResponseWriter, r *http.Request) {
if !s.enableSmuxConnPool {
s.buildFailureResp(w, 500, "smux pool not supported")
return
}
if s.smuxConnPool == nil {
s.buildFailureResp(w, 500, "smux pool now is nil")
return
}
stat := s.smuxConnPool.GetStat()
s.buildSuccessResp(w, stat)
}
}
func (s *DataNode) setMetricsDegrade(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
w.Write([]byte(err.Error()))
return
}
if level := r.FormValue("level"); level != "" {
val, err := strconv.Atoi(level)
if err != nil {
w.Write([]byte("Set metrics degrade level failed\n"))
} else {
atomic.StoreInt64(&s.metricsDegrade, int64(val))
w.Write([]byte(fmt.Sprintf("Set metrics degrade level to %v successfully\n", val)))
}
}
}
func (s *DataNode) getMetricsDegrade(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(fmt.Sprintf("%v\n", atomic.LoadInt64(&s.metricsDegrade))))
}
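// genClusterVersionFile writes the cluster version file onto every partition disk and the
// raft directory, failing if any of those paths already contains one.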
func (s *DataNode) genClusterVersionFile(w http.ResponseWriter, r *http.Request) {
paths := make([]string, 0)
s.space.RangePartitions(func(partition *DataPartition) bool {
paths = append(paths, partition.disk.Path)
return true
})
paths = append(paths, s.raftDir)
for _, p := range paths {
if _, err := os.Stat(path.Join(p, config.ClusterVersionFile)); err == nil || os.IsExist(err) {
s.buildFailureResp(w, http.StatusCreated, "cluster version file already exists in "+p)
return
}
}
for _, p := range paths {
if err := config.CheckOrStoreClusterUuid(p, s.clusterUuid, true); err != nil {
s.buildFailureResp(w, http.StatusInternalServerError, "Failed to create cluster version file in "+p)
return
}
}
s.buildSuccessResp(w, "Generate cluster version file success")
}
func (s *DataNode) buildSuccessResp(w http.ResponseWriter, data interface{}) {
s.buildJSONResp(w, http.StatusOK, data, "")
}
func (s *DataNode) buildFailureResp(w http.ResponseWriter, code int, msg string) {
s.buildJSONResp(w, code, nil, msg)
}
// Create response for the API request.
func (s *DataNode) buildJSONResp(w http.ResponseWriter, code int, data interface{}, msg string) {
var (
jsonBody []byte
err error
)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
body := proto.HTTPReply{Code: int32(code), Msg: msg, Data: data}
if jsonBody, err = json.Marshal(body); err != nil {
return
}
w.Write(jsonBody)
}
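// setDiskBadAPI manually marks the disk at the given path as unavailable, triggering the
// normal disk-error handling.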
func (s *DataNode) setDiskBadAPI(w http.ResponseWriter, r *http.Request) {
const (
paramDiskPath = "diskPath"
)
var (
err error
diskPath string
disk *Disk
)
if err = r.ParseForm(); err != nil {
err = fmt.Errorf("parse form fail: %v", err)
log.LogErrorf("[setDiskBadAPI] %v", err.Error())
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if diskPath = r.FormValue(paramDiskPath); diskPath == "" {
err = fmt.Errorf("param(%v) is empty", paramDiskPath)
log.LogErrorf("[setDiskBadAPI] %v", err.Error())
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if disk, err = s.space.GetDisk(diskPath); err != nil {
err = fmt.Errorf("not exit such dissk, path: %v", diskPath)
log.LogErrorf("[setDiskBadAPI] %v", err.Error())
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if disk.Status == proto.Unavailable {
msg := fmt.Sprintf("disk(%v) status was already unavailable, nothing to do", disk.Path)
log.LogInfof("[setDiskBadAPI] %v", msg)
s.buildSuccessResp(w, msg)
return
}
log.LogWarnf("[setDiskBadAPI] set bad disk, path: %v", disk.Path)
disk.doDiskError()
s.buildSuccessResp(w, "OK")
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"fmt"
"math"
"os"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/loadutil"
"github.com/cubefs/cubefs/util/log"
"github.com/shirou/gopsutil/disk"
)
// SpaceManager manages the disk space.
type SpaceManager struct {
clusterID string
disks map[string]*Disk
partitions map[uint64]*DataPartition
raftStore raftstore.RaftStore
nodeID uint64
diskMutex sync.RWMutex
partitionMutex sync.RWMutex
stats *Stats
stopC chan bool
selectedIndex int // TODO what is selected index
diskList []string
dataNode *DataNode
createPartitionMutex sync.RWMutex
diskUtils map[string]*atomicutil.Float64
samplerDone chan struct{}
}
const diskSampleDuration = 1 * time.Second
// NewSpaceManager creates a new space manager.
func NewSpaceManager(dataNode *DataNode) *SpaceManager {
space := &SpaceManager{}
space.disks = make(map[string]*Disk)
space.diskList = make([]string, 0)
space.partitions = make(map[uint64]*DataPartition)
space.stats = NewStats(dataNode.zoneName)
space.stopC = make(chan bool)
space.dataNode = dataNode
space.diskUtils = make(map[string]*atomicutil.Float64)
go space.statUpdateScheduler()
return space
}
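// Stop shuts down the space manager: it stops the stat scheduler and disk sampler,
// stops raft on every partition, and then stops the partitions themselves with bounded
// parallelism.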
func (manager *SpaceManager) Stop() {
defer func() {
recover()
}()
close(manager.stopC)
// stop sampler
close(manager.samplerDone)
// Stop data partitions in parallel.
const maxParallelism = 128
parallelism := int(math.Min(float64(maxParallelism), float64(len(manager.partitions))))
wg := sync.WaitGroup{}
partitionC := make(chan *DataPartition, parallelism)
wg.Add(1)
// Close raft store.
for _, partition := range manager.partitions {
partition.stopRaft()
}
go func(c chan<- *DataPartition) {
defer wg.Done()
for _, partition := range manager.partitions {
c <- partition
}
close(c)
}(partitionC)
for i := 0; i < parallelism; i++ {
wg.Add(1)
go func(c <-chan *DataPartition) {
defer wg.Done()
var partition *DataPartition
for {
if partition = <-c; partition == nil {
return
}
partition.Stop()
}
}(partitionC)
}
wg.Wait()
}
func (manager *SpaceManager) GetAllDiskPartitions() []*disk.PartitionStat {
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
partitions := make([]*disk.PartitionStat, 0, len(manager.disks))
for _, disk := range manager.disks {
partition := disk.GetDiskPartition()
if partition != nil {
partitions = append(partitions, partition)
}
}
return partitions
}
func (manager *SpaceManager) FillIoUtils(samples map[string]loadutil.DiskIoSample) {
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
for _, sample := range samples {
util := manager.diskUtils[sample.GetPartition().Device]
if util != nil {
util.Store(sample.GetIoUtilPercent())
}
}
}
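// StartDiskSample launches a background loop that samples disk IO utilization for all
// disk partitions until the sampler is stopped or a sampling error occurs.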
func (manager *SpaceManager) StartDiskSample() {
manager.samplerDone = make(chan struct{})
go func() {
for {
select {
case <-manager.samplerDone:
return
default:
partitions := manager.GetAllDiskPartitions()
samples, err := loadutil.GetDisksIoSample(partitions, diskSampleDuration)
if err != nil {
log.LogErrorf("failed to sample disk %v\n", err.Error())
return
}
manager.FillIoUtils(samples)
}
}
}()
}
func (manager *SpaceManager) GetDiskUtils() map[string]float64 {
utils := make(map[string]float64)
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
for device, used := range manager.diskUtils {
utils[device] = used.Load()
}
return utils
}
func (manager *SpaceManager) SetNodeID(nodeID uint64) {
manager.nodeID = nodeID
}
func (manager *SpaceManager) GetNodeID() (nodeID uint64) {
return manager.nodeID
}
func (manager *SpaceManager) SetClusterID(clusterID string) {
manager.clusterID = clusterID
}
func (manager *SpaceManager) GetClusterID() (clusterID string) {
return manager.clusterID
}
func (manager *SpaceManager) SetRaftStore(raftStore raftstore.RaftStore) {
manager.raftStore = raftStore
}
func (manager *SpaceManager) GetRaftStore() (raftStore raftstore.RaftStore) {
return manager.raftStore
}
func (manager *SpaceManager) RangePartitions(f func(partition *DataPartition) bool) {
if f == nil {
return
}
manager.partitionMutex.RLock()
partitions := make([]*DataPartition, 0)
for _, dp := range manager.partitions {
partitions = append(partitions, dp)
}
manager.partitionMutex.RUnlock()
for _, partition := range partitions {
if !f(partition) {
break
}
}
}
func (manager *SpaceManager) GetDisks() (disks []*Disk) {
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
disks = make([]*Disk, 0)
for _, disk := range manager.disks {
disks = append(disks, disk)
}
return
}
func (manager *SpaceManager) Stats() *Stats {
return manager.stats
}
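// LoadDisk creates a Disk for the given path if it is not registered yet, restores its
// data partitions into the manager, and starts the disk's background tasks.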
func (manager *SpaceManager) LoadDisk(path string, reservedSpace, diskRdonlySpace uint64, maxErrCnt int) (err error) {
var (
disk *Disk
visitor PartitionVisitor
)
if diskRdonlySpace < reservedSpace {
diskRdonlySpace = reservedSpace
}
log.LogDebugf("action[LoadDisk] load disk from path(%v).", path)
visitor = func(dp *DataPartition) {
manager.partitionMutex.Lock()
defer manager.partitionMutex.Unlock()
if _, has := manager.partitions[dp.partitionID]; !has {
manager.partitions[dp.partitionID] = dp
log.LogDebugf("action[LoadDisk] put partition(%v) to manager manager.", dp.partitionID)
}
}
if _, err = manager.GetDisk(path); err != nil {
disk, err = NewDisk(path, reservedSpace, diskRdonlySpace, maxErrCnt, manager)
if err != nil {
log.LogErrorf("NewDisk fail err:[%v]", err)
return
}
err = disk.RestorePartition(visitor)
if err != nil {
log.LogErrorf("RestorePartition fail err:[%v]", err)
return
}
manager.putDisk(disk)
err = nil
go disk.doBackendTask()
}
return
}
func (manager *SpaceManager) GetDisk(path string) (d *Disk, err error) {
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
disk, has := manager.disks[path]
if has && disk != nil {
d = disk
return
}
err = fmt.Errorf("disk(%v) not exsit", path)
return
}
func (manager *SpaceManager) putDisk(d *Disk) {
manager.diskMutex.Lock()
manager.disks[d.Path] = d
manager.diskList = append(manager.diskList, d.Path)
if d.GetDiskPartition() != nil {
manager.diskUtils[d.GetDiskPartition().Device] = &atomicutil.Float64{}
manager.diskUtils[d.GetDiskPartition().Device].Store(0)
}
manager.diskMutex.Unlock()
}
func (manager *SpaceManager) updateMetrics() {
manager.diskMutex.RLock()
var (
total, used, available uint64
totalPartitionSize, remainingCapacityToCreatePartition uint64
maxCapacityToCreatePartition, partitionCnt uint64
)
maxCapacityToCreatePartition = 0
for _, d := range manager.disks {
if d.Status == proto.Unavailable {
log.LogInfof("disk is broken, not stat disk useage, diskpath %s", d.Path)
continue
}
total += d.Total
used += d.Used
available += d.Available
totalPartitionSize += d.Allocated
remainingCapacityToCreatePartition += d.Unallocated
partitionCnt += uint64(d.PartitionCount())
if maxCapacityToCreatePartition < d.Unallocated {
maxCapacityToCreatePartition = d.Unallocated
}
}
manager.diskMutex.RUnlock()
log.LogDebugf("action[updateMetrics] total(%v) used(%v) available(%v) totalPartitionSize(%v) remainingCapacityToCreatePartition(%v) "+
"partitionCnt(%v) maxCapacityToCreatePartition(%v) ", total, used, available, totalPartitionSize, remainingCapacityToCreatePartition, partitionCnt, maxCapacityToCreatePartition)
manager.stats.updateMetrics(total, used, available, totalPartitionSize,
remainingCapacityToCreatePartition, maxCapacityToCreatePartition, partitionCnt)
}
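// minPartitionCnt selects the read-write disk with the lowest selection weight, skipping
// decommissioned disks; it returns nil when no eligible disk is found.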
func (manager *SpaceManager) minPartitionCnt(decommissionedDisks []string) (d *Disk) {
manager.diskMutex.Lock()
defer manager.diskMutex.Unlock()
var (
minWeight float64
minWeightDisk *Disk
)
decommissionedDiskMap := make(map[string]struct{})
for _, disk := range decommissionedDisks {
decommissionedDiskMap[disk] = struct{}{}
}
minWeight = math.MaxFloat64
for _, disk := range manager.disks {
if _, ok := decommissionedDiskMap[disk.Path]; ok {
log.LogInfof("action[minPartitionCnt] exclude decommissioned disk[%v]", disk.Path)
continue
}
if disk.Status != proto.ReadWrite {
continue
}
diskWeight := disk.getSelectWeight()
if diskWeight < minWeight {
minWeight = diskWeight
minWeightDisk = disk
}
}
if minWeightDisk == nil {
return
}
if minWeightDisk.Status != proto.ReadWrite {
return
}
d = minWeightDisk
return d
}
func (manager *SpaceManager) statUpdateScheduler() {
go func() {
ticker := time.NewTicker(10 * time.Second)
for {
select {
case <-ticker.C:
manager.updateMetrics()
case <-manager.stopC:
ticker.Stop()
return
}
}
}()
}
func (manager *SpaceManager) Partition(partitionID uint64) (dp *DataPartition) {
manager.partitionMutex.RLock()
defer manager.partitionMutex.RUnlock()
dp = manager.partitions[partitionID]
return
}
func (manager *SpaceManager) AttachPartition(dp *DataPartition) {
manager.partitionMutex.Lock()
defer manager.partitionMutex.Unlock()
manager.partitions[dp.partitionID] = dp
}
// DetachDataPartition removes a data partition from the partition map.
func (manager *SpaceManager) DetachDataPartition(partitionID uint64) {
manager.partitionMutex.Lock()
defer manager.partitionMutex.Unlock()
delete(manager.partitions, partitionID)
}
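// CreatePartition creates a data partition for the given request on the least-loaded
// writable disk, or returns the existing partition when the request matches one that is
// already attached.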
func (manager *SpaceManager) CreatePartition(request *proto.CreateDataPartitionRequest) (dp *DataPartition, err error) {
manager.partitionMutex.Lock()
defer manager.partitionMutex.Unlock()
dpCfg := &dataPartitionCfg{
PartitionID: request.PartitionId,
VolName: request.VolumeId,
Peers: request.Members,
Hosts: request.Hosts,
RaftStore: manager.raftStore,
NodeID: manager.nodeID,
ClusterID: manager.clusterID,
PartitionSize: request.PartitionSize,
PartitionType: int(request.PartitionTyp),
ReplicaNum: request.ReplicaNum,
VerSeq: request.VerSeq,
CreateType: request.CreateType,
Forbidden: false,
}
log.LogInfof("action[CreatePartition] dp %v dpCfg.Peers %v request.Members %v",
dpCfg.PartitionID, dpCfg.Peers, request.Members)
dp = manager.partitions[dpCfg.PartitionID]
if dp != nil {
if err = dp.IsEquareCreateDataPartitionRequst(request); err != nil {
return nil, err
}
return
}
disk := manager.minPartitionCnt(request.DecommissionedDisks)
if disk == nil {
return nil, ErrNoSpaceToCreatePartition
}
if dp, err = CreateDataPartition(dpCfg, disk, request); err != nil {
return
}
manager.partitions[dp.partitionID] = dp
return
}
// DeletePartition deletes a partition based on the partition id.
func (manager *SpaceManager) DeletePartition(dpID uint64) {
manager.partitionMutex.Lock()
dp := manager.partitions[dpID]
if dp == nil {
manager.partitionMutex.Unlock()
return
}
delete(manager.partitions, dpID)
manager.partitionMutex.Unlock()
dp.Stop()
dp.Disk().DetachDataPartition(dp)
os.RemoveAll(dp.Path())
}
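// buildHeartBeatResponse fills the heartbeat response with space statistics,
// per-partition reports and the list of bad disks.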
func (s *DataNode) buildHeartBeatResponse(response *proto.DataNodeHeartbeatResponse) {
response.Status = proto.TaskSucceeds
stat := s.space.Stats()
stat.Lock()
response.Used = stat.Used
response.Total = stat.Total
response.Available = stat.Available
response.CreatedPartitionCnt = uint32(stat.CreatedPartitionCnt)
response.TotalPartitionSize = stat.TotalPartitionSize
response.MaxCapacity = stat.MaxCapacityToCreatePartition
response.RemainingCapacity = stat.RemainingCapacityToCreatePartition
response.BadDisks = make([]string, 0)
response.BadDiskStats = make([]proto.BadDiskStat, 0)
response.StartTime = s.startTime
stat.Unlock()
response.ZoneName = s.zoneName
response.PartitionReports = make([]*proto.DataPartitionReport, 0)
space := s.space
space.RangePartitions(func(partition *DataPartition) bool {
leaderAddr, isLeader := partition.IsRaftLeader()
vr := &proto.DataPartitionReport{
VolName: partition.volumeID,
PartitionID: uint64(partition.partitionID),
PartitionStatus: partition.Status(),
Total: uint64(partition.Size()),
Used: uint64(partition.Used()),
DiskPath: partition.Disk().Path,
IsLeader: isLeader,
ExtentCount: partition.GetExtentCount(),
NeedCompare: true,
DecommissionRepairProgress: partition.decommissionRepairProgress,
}
log.LogDebugf("action[Heartbeats] dpid(%v), status(%v) total(%v) used(%v) leader(%v) isLeader(%v).", vr.PartitionID, vr.PartitionStatus, vr.Total, vr.Used, leaderAddr, vr.IsLeader)
response.PartitionReports = append(response.PartitionReports, vr)
return true
})
disks := space.GetDisks()
for _, d := range disks {
if d.Status == proto.Unavailable {
response.BadDisks = append(response.BadDisks, d.Path)
bds := proto.BadDiskStat{
DiskPath: d.Path,
TotalPartitionCnt: d.PartitionCount(),
DiskErrPartitionList: d.GetDiskErrPartitionList(),
}
response.BadDiskStats = append(response.BadDiskStats, bds)
}
}
}
func (manager *SpaceManager) getPartitionIds() []uint64 {
res := make([]uint64, 0)
for id := range manager.partitions {
res = append(res, id)
}
return res
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"sync"
"sync/atomic"
"time"
)
// Stats defines various metrics that will be collected during the execution.
type Stats struct {
inDataSize uint64
outDataSize uint64
inFlow uint64
outFlow uint64
Zone string
ConnectionCnt int64
ClusterID string
TCPAddr string
Start time.Time
Total uint64
Used uint64
Available uint64 // available space
TotalPartitionSize uint64 // dataPartitionCnt * dataPartitionSize
RemainingCapacityToCreatePartition uint64
CreatedPartitionCnt uint64
LackPartitionsInMem uint64
LackPartitionsInDisk uint64
// the maximum capacity among all the disks that can be used to create partition
MaxCapacityToCreatePartition uint64
sync.Mutex
}
// NewStats creates a new Stats.
func NewStats(zone string) (s *Stats) {
s = new(Stats)
s.Zone = zone
return s
}
// AddConnection adds a connection.
func (s *Stats) AddConnection() {
atomic.AddInt64(&s.ConnectionCnt, 1)
}
// RemoveConnection removes a connection.
func (s *Stats) RemoveConnection() {
atomic.AddInt64(&s.ConnectionCnt, -1)
}
// GetConnectionCount gets the connection count.
func (s *Stats) GetConnectionCount() int64 {
return atomic.LoadInt64(&s.ConnectionCnt)
}
func (s *Stats) updateMetrics(
total, used, available, createdPartitionWeights, remainWeightsForCreatePartition,
maxWeightsForCreatePartition, dataPartitionCnt uint64) {
s.Lock()
defer s.Unlock()
s.Total = total
s.Used = used
s.Available = available
s.TotalPartitionSize = createdPartitionWeights
s.RemainingCapacityToCreatePartition = remainWeightsForCreatePartition
s.MaxCapacityToCreatePartition = maxWeightsForCreatePartition
s.CreatedPartitionCnt = dataPartitionCnt
}
func (s *Stats) updateMetricLackPartitionsInMem(lackPartitionsInMem uint64) {
s.Lock()
defer s.Unlock()
s.LackPartitionsInMem = lackPartitionsInMem
}
func (s *Stats) updateMetricLackPartitionsInDisk(lackPartitionsInDisk uint64) {
s.Lock()
defer s.Unlock()
s.LackPartitionsInDisk = lackPartitionsInDisk
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"hash/crc32"
"net"
"strconv"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
raftProto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
var ErrForbiddenDataPartition = errors.New("the data partition is forbidden")
func (s *DataNode) getPacketTpLabels(p *repl.Packet) map[string]string {
labels := make(map[string]string)
labels[exporter.Vol] = ""
labels[exporter.Op] = ""
labels[exporter.PartId] = ""
labels[exporter.Disk] = ""
if part, ok := p.Object.(*DataPartition); ok {
labels[exporter.Vol] = part.volumeID
labels[exporter.Op] = p.GetOpMsg()
if exporter.EnablePid {
labels[exporter.PartId] = fmt.Sprintf("%d", part.partitionID)
labels[exporter.Disk] = part.path
}
}
return labels
}
func isColdVolExtentDelErr(p *repl.Packet) bool {
if p.Object == nil {
return false
}
partition, ok := p.Object.(*DataPartition)
if !ok {
return false
}
if proto.IsNormalDp(partition.partitionType) {
return false
}
if p.ResultCode == proto.OpNotExistErr {
return true
}
return false
}
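// OperatePacket dispatches a replication packet to the handler for its opcode and records
// latency metrics and operation logs around the call.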
func (s *DataNode) OperatePacket(p *repl.Packet, c net.Conn) (err error) {
var (
tpLabels map[string]string
tpObject *exporter.TimePointCount
)
log.LogDebugf("action[OperatePacket] %v, pack [%v]", p.GetOpMsg(), p)
shallDegrade := p.ShallDegrade()
sz := p.Size
if !shallDegrade {
tpObject = exporter.NewTPCnt(p.GetOpMsg())
tpLabels = s.getPacketTpLabels(p)
}
start := time.Now().UnixNano()
defer func() {
resultSize := p.Size
p.Size = sz
if p.IsErrPacket() {
err = fmt.Errorf("op(%v) error(%v)", p.GetOpMsg(), string(p.Data[:resultSize]))
logContent := fmt.Sprintf("action[OperatePacket] %v.",
p.LogMessage(p.GetOpMsg(), c.RemoteAddr().String(), start, err))
if isColdVolExtentDelErr(p) {
log.LogInfof(logContent)
} else {
log.LogErrorf(logContent)
}
} else {
logContent := fmt.Sprintf("action[OperatePacket] %v.",
p.LogMessage(p.GetOpMsg(), c.RemoteAddr().String(), start, nil))
switch p.Opcode {
case proto.OpStreamRead, proto.OpRead, proto.OpExtentRepairRead, proto.OpStreamFollowerRead:
case proto.OpReadTinyDeleteRecord:
log.LogRead(logContent)
case proto.OpWrite, proto.OpRandomWrite,
proto.OpRandomWriteVer, proto.OpSyncRandomWriteVer,
proto.OpRandomWriteAppend, proto.OpSyncRandomWriteAppend,
proto.OpTryWriteAppend, proto.OpSyncTryWriteAppend,
proto.OpSyncRandomWrite, proto.OpSyncWrite, proto.OpMarkDelete, proto.OpSplitMarkDelete:
log.LogWrite(logContent)
default:
log.LogInfo(logContent)
}
}
p.Size = resultSize
if !shallDegrade {
tpObject.SetWithLabels(err, tpLabels)
}
}()
switch p.Opcode {
case proto.OpCreateExtent:
s.handlePacketToCreateExtent(p)
case proto.OpWrite, proto.OpSyncWrite:
s.handleWritePacket(p)
case proto.OpStreamRead:
s.handleStreamReadPacket(p, c, StreamRead)
case proto.OpStreamFollowerRead:
s.extentRepairReadPacket(p, c, StreamRead)
case proto.OpExtentRepairRead:
s.handleExtentRepairReadPacket(p, c, RepairRead)
case proto.OpTinyExtentRepairRead:
s.handleTinyExtentRepairReadPacket(p, c)
case proto.OpMarkDelete, proto.OpSplitMarkDelete:
s.handleMarkDeletePacket(p, c)
case proto.OpBatchDeleteExtent:
s.handleBatchMarkDeletePacket(p, c)
case proto.OpRandomWrite, proto.OpSyncRandomWrite,
proto.OpRandomWriteAppend, proto.OpSyncRandomWriteAppend,
proto.OpTryWriteAppend, proto.OpSyncTryWriteAppend,
proto.OpRandomWriteVer, proto.OpSyncRandomWriteVer:
s.handleRandomWritePacket(p)
case proto.OpNotifyReplicasToRepair:
s.handlePacketToNotifyExtentRepair(p)
case proto.OpGetAllWatermarks:
s.handlePacketToGetAllWatermarks(p)
case proto.OpCreateDataPartition:
s.handlePacketToCreateDataPartition(p)
case proto.OpLoadDataPartition:
s.handlePacketToLoadDataPartition(p)
case proto.OpDeleteDataPartition:
s.handlePacketToDeleteDataPartition(p)
case proto.OpDataNodeHeartbeat:
s.handleHeartbeatPacket(p)
case proto.OpGetAppliedId:
s.handlePacketToGetAppliedID(p)
case proto.OpDecommissionDataPartition:
s.handlePacketToDecommissionDataPartition(p)
case proto.OpAddDataPartitionRaftMember:
s.handlePacketToAddDataPartitionRaftMember(p)
case proto.OpRemoveDataPartitionRaftMember:
s.handlePacketToRemoveDataPartitionRaftMember(p)
case proto.OpDataPartitionTryToLeader:
s.handlePacketToDataPartitionTryToLeader(p)
case proto.OpGetPartitionSize:
s.handlePacketToGetPartitionSize(p)
case proto.OpGetMaxExtentIDAndPartitionSize:
s.handlePacketToGetMaxExtentIDAndPartitionSize(p)
case proto.OpReadTinyDeleteRecord:
s.handlePacketToReadTinyDeleteRecordFile(p, c)
case proto.OpBroadcastMinAppliedID:
s.handleBroadcastMinAppliedID(p)
case proto.OpVersionOperation:
s.handleUpdateVerPacket(p)
case proto.OpStopDataPartitionRepair:
s.handlePacketToStopDataPartitionRepair(p)
default:
p.PackErrorBody(repl.ErrorUnknownOp.Error(), repl.ErrorUnknownOp.Error()+strconv.Itoa(int(p.Opcode)))
}
return
}
// Handle OpCreateExtent packet.
func (s *DataNode) handlePacketToCreateExtent(p *repl.Packet) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionCreateExtent, err.Error())
} else {
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
if partition.Available() <= 0 || !partition.disk.CanWrite() {
err = storage.NoSpaceError
return
} else if partition.disk.Status == proto.Unavailable {
err = storage.BrokenDiskError
return
}
// reject creation when the partition already holds too many extents
if partition.GetExtentCount() >= storage.MaxExtentCount+10 {
err = storage.NoSpaceError
return
}
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
partition.disk.limitWrite.Run(0, func() {
err = partition.ExtentStore().Create(p.ExtentID)
})
}
// Handle OpCreateDataPartition packet.
func (s *DataNode) handlePacketToCreateDataPartition(p *repl.Packet) {
var (
err error
bytes []byte
dp *DataPartition
)
defer func() {
if err != nil {
p.PackErrorBody(ActionCreateDataPartition, err.Error())
}
}()
task := &proto.AdminTask{}
if err = json.Unmarshal(p.Data, task); err != nil {
err = fmt.Errorf("cannnot unmashal adminTask")
return
}
request := &proto.CreateDataPartitionRequest{}
if task.OpCode != proto.OpCreateDataPartition {
err = fmt.Errorf("from master Task(%v) failed,error unavali opcode(%v)", task.ToString(), task.OpCode)
return
}
bytes, err = json.Marshal(task.Request)
if err != nil {
err = fmt.Errorf("from master Task(%v) cannot unmashal CreateDataPartition, err %s", task.ToString(), err.Error())
return
}
p.AddMesgLog(string(bytes))
if err = json.Unmarshal(bytes, request); err != nil {
err = fmt.Errorf("from master Task(%v) cannot unmashal CreateDataPartitionRequest struct, err(%s)", task.ToString(), err.Error())
return
}
p.PartitionID = request.PartitionId
if dp, err = s.space.CreatePartition(request); err != nil {
err = fmt.Errorf("from master Task(%v) cannot create Partition err(%v)", task.ToString(), err)
return
}
p.PacketOkWithBody([]byte(dp.Disk().Path))
}
func (s *DataNode) commitDelVersion(volumeID string, verSeq uint64) (err error) {
for _, partition := range s.space.partitions {
if partition.config.VolName != volumeID {
continue
}
verListMgr := partition.volVersionInfoList
verListMgr.RWLock.Lock()
for i, ver := range verListMgr.VerList {
if i == len(verListMgr.VerList)-1 {
log.LogWarnf("action[commitDelVersion] dp[%v] seq %v, seqArray size %v newest ver %v",
partition.config.PartitionID, verSeq, len(verListMgr.VerList), ver.Ver)
break
}
if ver.Ver == verSeq {
log.LogInfof("action[commitDelVersion] updateVerList dp[%v] seq %v,seqArray size %v", partition.config.PartitionID, verSeq, len(verListMgr.VerList))
verListMgr.VerList = append(verListMgr.VerList[:i], verListMgr.VerList[i+1:]...)
break
}
}
verListMgr.RWLock.Unlock()
}
return
}
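// commitCreateVersion applies a multi-version operation to every local raft-leader
// partition of the volume and, for a commit, advances the volume's two-phase version
// state.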
func (s *DataNode) commitCreateVersion(req *proto.MultiVersionOpRequest) (err error) {
log.LogInfof("action[commitCreateVersion] handle master version reqeust %v", req)
var (
value interface{}
ok bool
wg sync.WaitGroup
)
if value, ok = s.volUpdating.Load(req.VolumeID); !ok {
log.LogWarnf("action[commitCreateVersion] vol %v not found seq %v", req.VolumeID, req.VerSeq)
return
}
ver2Phase := value.(*verOp2Phase)
log.LogInfof("action[commitCreateVersion] try commit volume %v ver2Phase seq %v with req seq %v",
req.VolumeID, ver2Phase.verPrepare, req.VerSeq)
if req.VerSeq < ver2Phase.verSeq {
log.LogWarnf("action[commitCreateVersion] vol %v seq %v create less than loal %v", req.VolumeID, req.VerSeq, ver2Phase.verSeq)
return
}
if ver2Phase.step != proto.CreateVersionPrepare {
log.LogWarnf("action[commitCreateVersion] vol %v seq %v step not prepare", req.VolumeID, ver2Phase.step)
}
s.space.partitionMutex.RLock()
defer s.space.partitionMutex.RUnlock()
resultCh := make(chan error, len(s.space.partitions))
for _, partition := range s.space.partitions {
if partition.config.VolName != req.VolumeID {
continue
}
if !partition.isRaftLeader {
continue
}
wg.Add(1)
go func(partition *DataPartition) {
defer wg.Done()
log.LogInfof("action[commitCreateVersion] volume %v dp[%v] do HandleVersionOp verSeq[%v]",
partition.volumeID, partition.partitionID, partition.verSeq)
if err = partition.HandleVersionOp(req); err != nil {
log.LogErrorf("action[commitCreateVersion] volume %v dp[%v] do HandleVersionOp verSeq[%v] err %v",
partition.volumeID, partition.partitionID, partition.verSeq, err)
resultCh <- err
return
}
}(partition)
}
wg.Wait()
select {
case err = <-resultCh:
if err != nil {
close(resultCh)
return
}
default:
log.LogInfof("action[commitCreateVersion] volume %v do HandleVersionOp verseq [%v] finished", req.VolumeID, req.VerSeq)
}
close(resultCh)
if req.Op == proto.DeleteVersion {
return
}
if req.Op == proto.CreateVersionPrepare {
log.LogInfof("action[commitCreateVersion] commit volume %v prepare seq %v with commit seq %v",
req.VolumeID, ver2Phase.verPrepare, req.VerSeq)
return
}
ver2Phase.verSeq = req.VerSeq
ver2Phase.step = proto.CreateVersionCommit
ver2Phase.status = proto.VersionWorkingFinished
log.LogInfof("action[commitCreateVersion] commit volume %v prepare seq %v with commit seq %v",
req.VolumeID, ver2Phase.verPrepare, req.VerSeq)
return
}
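// prepareCreateVersion records the prepare phase of a two-phase version change for the
// volume; it reports whether the same sequence is already being prepared so the caller
// can skip duplicate work.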
func (s *DataNode) prepareCreateVersion(req *proto.MultiVersionOpRequest) (err error, opAgain bool) {
var ver2Phase *verOp2Phase
if value, ok := s.volUpdating.Load(req.VolumeID); ok {
ver2Phase = value.(*verOp2Phase)
if req.VerSeq < ver2Phase.verSeq {
err = fmt.Errorf("seq %v create less than loal %v", req.VerSeq, ver2Phase.verSeq)
log.LogInfof("action[prepareCreateVersion] volume %v update to ver %v step %v", req.VolumeID, req.VerSeq, ver2Phase.step)
return
} else if req.VerSeq == ver2Phase.verPrepare {
if ver2Phase.step == proto.VersionWorking {
opAgain = true
return
}
}
}
ver2Phase = &verOp2Phase{}
ver2Phase.step = uint32(req.Op)
ver2Phase.status = proto.VersionWorking
ver2Phase.verPrepare = req.VerSeq
s.volUpdating.Store(req.VolumeID, ver2Phase)
log.LogInfof("action[prepareCreateVersion] volume %v update seq to %v step %v",
req.VolumeID, req.VerSeq, ver2Phase.step)
return
}
// Handle OpVersionOperation packet.
func (s *DataNode) handleUpdateVerPacket(p *repl.Packet) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionUpdateVersion, err.Error())
} else {
p.PacketOkReply()
}
}()
task := &proto.AdminTask{}
err = json.Unmarshal(p.Data, task)
if err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
return
}
request := &proto.MultiVersionOpRequest{}
response := &proto.MultiVersionOpResponse{}
response.Op = task.OpCode
response.Status = proto.TaskSucceeds
if task.OpCode == proto.OpVersionOperation {
marshaled, _ := json.Marshal(task.Request)
if err = json.Unmarshal(marshaled, request); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
response.Status = proto.TaskFailed
goto end
}
if request.Op == proto.CreateVersionPrepare {
if err, _ = s.prepareCreateVersion(request); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
if err = s.commitCreateVersion(request); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
} else if request.Op == proto.CreateVersionCommit {
if err = s.commitCreateVersion(request); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
} else if request.Op == proto.DeleteVersion {
if err = s.commitDelVersion(request.VolumeID, request.VerSeq); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
}
response.VerSeq = request.VerSeq
response.Op = request.Op
response.Addr = request.Addr
response.VolumeID = request.VolumeID
} else {
err = fmt.Errorf("illegal opcode")
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
end:
if err != nil {
response.Result = err.Error()
}
task.Response = response
log.LogInfof("action[handleUpdateVerPacket] rsp to client,req vol %v, verseq %v, op %v", request.VolumeID, request.VerSeq, request.Op)
if err = MasterClient.NodeAPI().ResponseDataNodeTask(task); err != nil {
err = errors.Trace(err, "handleUpdateVerPacket to master failed.")
log.LogErrorf(err.Error())
return
}
}
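// checkVolumeForbidden marks the partitions belonging to the listed volumes as forbidden
// and clears the flag on all other partitions.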
func (s *DataNode) checkVolumeForbidden(volNames []string) {
s.space.RangePartitions(func(partition *DataPartition) bool {
for _, volName := range volNames {
if volName == partition.volumeID {
partition.SetForbidden(true)
return true
}
}
partition.SetForbidden(false)
return true
})
}
func (s *DataNode) checkDecommissionDisks(decommissionDisks []string) {
decommissionDiskSet := util.NewSet()
for _, disk := range decommissionDisks {
decommissionDiskSet.Add(disk)
}
disks := s.space.GetDisks()
for _, disk := range disks {
if disk.GetDecommissionStatus() && !decommissionDiskSet.Has(disk.Path) {
log.LogDebugf("action[checkDecommissionDisks] mark %v to be undecommissioned", disk.Path)
disk.MarkDecommissionStatus(false)
continue
}
if !disk.GetDecommissionStatus() && decommissionDiskSet.Has(disk.Path) {
log.LogDebugf("action[checkDecommissionDisks] mark %v to be decommissioned", disk.Path)
disk.MarkDecommissionStatus(true)
continue
}
}
}
// Handle OpHeartbeat packet.
func (s *DataNode) handleHeartbeatPacket(p *repl.Packet) {
var err error
task := &proto.AdminTask{}
err = json.Unmarshal(p.Data, task)
defer func() {
if err != nil {
p.PackErrorBody(ActionCreateDataPartition, err.Error())
} else {
p.PacketOkReply()
}
}()
if err != nil {
return
}
go func() {
request := &proto.HeartBeatRequest{}
response := &proto.DataNodeHeartbeatResponse{}
s.buildHeartBeatResponse(response)
if task.OpCode == proto.OpDataNodeHeartbeat {
marshaled, _ := json.Marshal(task.Request)
_ = json.Unmarshal(marshaled, request)
response.Status = proto.TaskSucceeds
if s.diskQosEnableFromMaster != request.EnableDiskQos {
log.LogWarnf("action[handleHeartbeatPacket] master command disk qos enable change to [%v], local conf enable [%v]",
request.EnableDiskQos,
s.diskQosEnable)
}
// set volume forbidden
s.checkVolumeForbidden(request.ForbiddenVols)
// set decommission disks
s.checkDecommissionDisks(request.DecommissionDisks)
s.diskQosEnableFromMaster = request.EnableDiskQos
var needUpdate bool
for _, pair := range []struct {
replace uint64
origin *int
}{
{request.QosFlowWriteLimit, &s.diskWriteFlow},
{request.QosFlowReadLimit, &s.diskReadFlow},
{request.QosIopsWriteLimit, &s.diskWriteIops},
{request.QosIopsReadLimit, &s.diskReadIops},
} {
if pair.replace > 0 && int(pair.replace) != *pair.origin {
*pair.origin = int(pair.replace)
needUpdate = true
}
}
// set cpu util and io used in here
response.CpuUtil = s.cpuUtil.Load()
response.IoUtils = s.space.GetDiskUtils()
if needUpdate {
log.LogWarnf("action[handleHeartbeatPacket] master change disk qos limit to [flowWrite %v, flowRead %v, iopsWrite %v, iopsRead %v]",
s.diskWriteFlow, s.diskReadFlow, s.diskWriteIops, s.diskReadIops)
s.updateQosLimit()
}
} else {
response.Status = proto.TaskFailed
err = fmt.Errorf("illegal opcode")
response.Result = err.Error()
}
task.Response = response
if err = MasterClient.NodeAPI().ResponseDataNodeTask(task); err != nil {
err = errors.Trace(err, "heartbeat to master(%v) failed.", request.MasterAddr)
log.LogErrorf(err.Error())
return
}
}()
}
// Handle OpDeleteDataPartition packet.
func (s *DataNode) handlePacketToDeleteDataPartition(p *repl.Packet) {
task := &proto.AdminTask{}
err := json.Unmarshal(p.Data, task)
defer func() {
if err != nil {
p.PackErrorBody(ActionDeleteDataPartition, err.Error())
} else {
p.PacketOkReply()
}
}()
if err != nil {
return
}
request := &proto.DeleteDataPartitionRequest{}
if task.OpCode == proto.OpDeleteDataPartition {
bytes, _ := json.Marshal(task.Request)
p.AddMesgLog(string(bytes))
err = json.Unmarshal(bytes, request)
if err != nil {
return
} else {
s.space.DeletePartition(request.PartitionId)
}
} else {
err = fmt.Errorf("illegal opcode ")
}
if err != nil {
err = errors.Trace(err, "delete DataPartition failed,PartitionID(%v)", request.PartitionId)
log.LogErrorf("action[handlePacketToDeleteDataPartition] err(%v).", err)
}
log.LogInfof(fmt.Sprintf("action[handlePacketToDeleteDataPartition] %v error(%v)", request.PartitionId, err))
}
// Handle OpLoadDataPartition packet.
func (s *DataNode) handlePacketToLoadDataPartition(p *repl.Packet) {
task := &proto.AdminTask{}
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionLoadDataPartition, err.Error())
} else {
p.PacketOkReply()
}
}()
if err = json.Unmarshal(p.Data, task); err != nil {
return
}
go s.asyncLoadDataPartition(task)
}
func (s *DataNode) asyncLoadDataPartition(task *proto.AdminTask) {
var err error
request := &proto.LoadDataPartitionRequest{}
response := &proto.LoadDataPartitionResponse{}
if task.OpCode == proto.OpLoadDataPartition {
bytes, _ := json.Marshal(task.Request)
json.Unmarshal(bytes, request)
dp := s.space.Partition(request.PartitionId)
if dp == nil {
response.Status = proto.TaskFailed
response.PartitionId = uint64(request.PartitionId)
err = fmt.Errorf("DataPartition(%v) not found", request.PartitionId)
response.Result = err.Error()
} else {
response = dp.Load()
response.PartitionId = uint64(request.PartitionId)
response.Status = proto.TaskSucceeds
}
} else {
response.PartitionId = uint64(request.PartitionId)
response.Status = proto.TaskFailed
err = fmt.Errorf("illegal opcode")
response.Result = err.Error()
}
task.Response = response
if err = MasterClient.NodeAPI().ResponseDataNodeTask(task); err != nil {
err = errors.Trace(err, "load DataPartition failed,PartitionID(%v)", request.PartitionId)
log.LogError(errors.Stack(err))
}
}
// Handle OpMarkDelete packet.
func (s *DataNode) handleMarkDeletePacket(p *repl.Packet, c net.Conn) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionBatchMarkDelete, err.Error())
} else {
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
// NOTE: we cannot reject the mark-delete even if the partition is forbidden,
// because the inode has already been deleted in the meta partition;
// blocking the delete here would leave "orphan extents"
if proto.IsTinyExtentType(p.ExtentType) || p.Opcode == proto.OpSplitMarkDelete {
ext := new(proto.TinyExtentDeleteRecord)
err = json.Unmarshal(p.Data, ext)
if err == nil {
log.LogInfof("handleMarkDeletePacket Delete PartitionID(%v)_Extent(%v)_Offset(%v)_Size(%v)",
p.PartitionID, p.ExtentID, ext.ExtentOffset, ext.Size)
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
partition.disk.limitWrite.Run(0, func() {
err = partition.ExtentStore().MarkDelete(p.ExtentID, int64(ext.ExtentOffset), int64(ext.Size))
if err != nil {
log.LogErrorf("action[handleMarkDeletePacket]: failed to mark delete extent(%v), %v", p.ExtentID, err)
}
})
}
} else {
log.LogInfof("handleMarkDeletePacket Delete PartitionID(%v)_Extent(%v)",
p.PartitionID, p.ExtentID)
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
partition.disk.limitWrite.Run(0, func() {
err = partition.ExtentStore().MarkDelete(p.ExtentID, 0, 0)
if err != nil {
log.LogErrorf("action[handleMarkDeletePacket]: failed to mark delete extent(%v), %v", p.ExtentID, err)
}
})
}
}
// Handle OpBatchDeleteExtent packet.
func (s *DataNode) handleBatchMarkDeletePacket(p *repl.Packet, c net.Conn) {
var err error
defer func() {
if err != nil {
log.LogErrorf(fmt.Sprintf("(%v) error(%v).", p.GetUniqueLogId(), err))
p.PackErrorBody(ActionBatchMarkDelete, err.Error())
} else {
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
// NOTE: we cannot reject the mark-delete even if the partition is forbidden,
// because the inode has already been deleted in the meta partition;
// blocking the delete here would leave "orphan extents"
var exts []*proto.ExtentKey
err = json.Unmarshal(p.Data, &exts)
store := partition.ExtentStore()
if err == nil {
for _, ext := range exts {
if deleteLimiteRater.Allow() {
log.LogInfof(fmt.Sprintf("recive DeleteExtent (%v) from (%v)", ext, c.RemoteAddr().String()))
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
partition.disk.limitWrite.Run(0, func() {
err = store.MarkDelete(ext.ExtentId, int64(ext.ExtentOffset), int64(ext.Size))
if err != nil {
log.LogErrorf("action[handleBatchMarkDeletePacket]: failed to mark delete extent(%v), %v", p.ExtentID, err)
}
})
if err != nil {
return
}
} else {
log.LogInfof("delete limiter reach(%v), remote (%v) try again.", deleteLimiteRater.Limit(), c.RemoteAddr().String())
err = storage.TryAgainError
}
}
}
}
// Handle OpWrite packet.
func (s *DataNode) handleWritePacket(p *repl.Packet) {
var (
err error
metricPartitionIOLabels map[string]string
partitionIOMetric *exporter.TimePointCount
)
defer func() {
if err != nil {
p.PackErrorBody(ActionWrite, err.Error())
} else {
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
if partition.IsForbidden() {
err = ErrForbiddenDataPartition
return
}
shallDegrade := p.ShallDegrade()
if !shallDegrade {
metricPartitionIOLabels = GetIoMetricLabels(partition, "write")
}
if partition.Available() <= 0 || !partition.disk.CanWrite() {
err = storage.NoSpaceError
return
} else if partition.disk.Status == proto.Unavailable {
err = storage.BrokenDiskError
return
}
store := partition.ExtentStore()
if proto.IsTinyExtentType(p.ExtentType) {
if !shallDegrade {
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
}
partition.disk.allocCheckLimit(proto.FlowWriteType, uint32(p.Size))
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
if writable := partition.disk.limitWrite.TryRun(int(p.Size), func() {
_, err = store.Write(p.ExtentID, p.ExtentOffset, int64(p.Size), p.Data, p.CRC, storage.AppendWriteType, p.IsSyncWrite())
}); !writable {
err = storage.TryAgainError
return
}
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
}
partition.checkIsDiskError(err, WriteFlag)
return
}
if p.Size <= util.BlockSize {
if !shallDegrade {
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
}
partition.disk.allocCheckLimit(proto.FlowWriteType, uint32(p.Size))
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
if writable := partition.disk.limitWrite.TryRun(int(p.Size), func() {
_, err = store.Write(p.ExtentID, p.ExtentOffset, int64(p.Size), p.Data, p.CRC, storage.AppendWriteType, p.IsSyncWrite())
}); !writable {
err = storage.TryAgainError
return
}
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
}
partition.checkIsDiskError(err, WriteFlag)
} else {
size := p.Size
offset := 0
for size > 0 {
currSize := util.Min(int(size), util.BlockSize)
data := p.Data[offset : offset+currSize]
crc := crc32.ChecksumIEEE(data)
if !shallDegrade {
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
}
partition.disk.allocCheckLimit(proto.FlowWriteType, uint32(currSize))
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
if writable := partition.disk.limitWrite.TryRun(currSize, func() {
_, err = store.Write(p.ExtentID, p.ExtentOffset+int64(offset), int64(currSize), data, crc, storage.AppendWriteType, p.IsSyncWrite())
}); !writable {
err = storage.TryAgainError
return
}
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
}
partition.checkIsDiskError(err, WriteFlag)
if err != nil {
break
}
size -= uint32(currSize)
offset += currSize
}
}
}
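// handleRandomWritePacket submits a random-write packet to raft on the leader replica;
// cache/preload partitions and non-leader replicas are rejected.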
func (s *DataNode) handleRandomWritePacket(p *repl.Packet) {
var (
err error
metricPartitionIOLabels map[string]string
partitionIOMetric *exporter.TimePointCount
)
defer func() {
log.LogDebugf("action[handleRandomWritePacket opcod %v seq %v dpid %v resultCode %v extid %v err %v",
p.Opcode, p.VerSeq, p.PartitionID, p.ResultCode, p.ExtentID, err)
if err != nil {
p.PackErrorBody(ActionWrite, err.Error())
} else {
// Avoid packing version info into the response packet, which would force the client to do extra work to read the buffer.
if p.Opcode == proto.OpRandomWriteVer || p.Opcode == proto.OpSyncRandomWriteVer {
p.Opcode = proto.OpSyncRandomWriteVerRsp
}
if p.Opcode == proto.OpTryWriteAppend && p.ResultCode == proto.OpTryOtherExtent {
p.PackErrorBody(ActionWrite, storage.SnapshotNeedNewExtentError.Error())
p.ResultCode = proto.OpTryOtherExtent
log.LogDebugf("action[handleRandomWritePacket opcod %v seq %v dpid %v resultCode %v extid %v", p.Opcode, p.VerSeq, p.PartitionID, p.ResultCode, p.ExtentID)
return
}
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
if partition.IsForbidden() {
err = ErrForbiddenDataPartition
return
}
log.LogDebugf("action[handleRandomWritePacket opcod %v seq %v dpid %v dpseq %v extid %v", p.Opcode, p.VerSeq, p.PartitionID, partition.verSeq, p.ExtentID)
// cache or preload partition not support raft and repair.
if !partition.isNormalType() {
err = raft.ErrStopped
return
}
_, isLeader := partition.IsRaftLeader()
if !isLeader {
err = raft.ErrNotLeader
return
}
shallDegrade := p.ShallDegrade()
if !shallDegrade {
metricPartitionIOLabels = GetIoMetricLabels(partition, "randwrite")
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
}
err = partition.RandomWriteSubmit(p)
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
}
if err != nil && strings.Contains(err.Error(), raft.ErrNotLeader.Error()) {
err = raft.ErrNotLeader
log.LogErrorf("action[handleRandomWritePacket] opcod %v seq %v dpid %v dpseq %v extid %v err %v", p.Opcode, p.VerSeq, p.PartitionID, partition.verSeq, p.ExtentID, err)
return
}
if err == nil && p.ResultCode != proto.OpOk && p.ResultCode != proto.OpTryOtherExtent {
log.LogErrorf("action[handleRandomWritePacket] opcod %v seq %v dpid %v dpseq %v extid %v ResultCode %v",
p.Opcode, p.VerSeq, p.PartitionID, partition.verSeq, p.ExtentID, p.ResultCode)
err = storage.TryAgainError
return
}
log.LogDebugf("action[handleRandomWritePacket] opcod %v seq %v dpid %v dpseq %v after raft submit err %v resultCode %v",
p.Opcode, p.VerSeq, p.PartitionID, partition.verSeq, err, p.ResultCode)
}
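// Handle OpStreamRead packet. Cache/preload partitions and non-leader replicas are
// rejected before the read is served by extentRepairReadPacket.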
func (s *DataNode) handleStreamReadPacket(p *repl.Packet, connect net.Conn, isRepairRead bool) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionStreamRead, err.Error())
p.WriteToConn(connect)
}
}()
partition := p.Object.(*DataPartition)
// cache or preload partition not support raft and repair.
if !partition.isNormalType() {
err = raft.ErrStopped
return
}
if err = partition.CheckLeader(p, connect); err != nil {
return
}
s.extentRepairReadPacket(p, connect, isRepairRead)
}
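// Handle OpExtentRepairRead packet. A repair-read slot is acquired via
// requestDoExtentRepair before the data is streamed back, and released once the
// handler finishes successfully.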
func (s *DataNode) handleExtentRepairReadPacket(p *repl.Packet, connect net.Conn, isRepairRead bool) {
var err error
log.LogDebugf("handleExtentRepairReadPacket %v", p)
defer func() {
if err != nil {
p.PackErrorBody(ActionStreamRead, err.Error())
p.WriteToConn(connect)
return
}
fininshDoExtentRepair()
}()
err = requestDoExtentRepair()
if err != nil {
return
}
s.extentRepairReadPacket(p, connect, isRepairRead)
}
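// Handle OpTinyExtentRepairRead packet.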
func (s *DataNode) handleTinyExtentRepairReadPacket(p *repl.Packet, connect net.Conn) {
s.tinyExtentRepairRead(p, connect)
}
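// extentRepairReadPacket streams the requested extent range back to the client in
// chunks of at most util.ReadBlockSize bytes, applying the per-disk IOPS and flow
// limiters to every chunk.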
func (s *DataNode) extentRepairReadPacket(p *repl.Packet, connect net.Conn, isRepairRead bool) {
var (
err error
metricPartitionIOLabels map[string]string
partitionIOMetric, tpObject *exporter.TimePointCount
)
defer func() {
if err != nil {
p.PackErrorBody(ActionStreamRead, err.Error())
p.WriteToConn(connect)
}
}()
partition := p.Object.(*DataPartition)
needReplySize := p.Size
offset := p.ExtentOffset
store := partition.ExtentStore()
shallDegrade := p.ShallDegrade()
if !shallDegrade {
metricPartitionIOLabels = GetIoMetricLabels(partition, "read")
}
log.LogDebugf("extentRepairReadPacket dp %v offset %v needSize %v", partition.partitionID, offset, needReplySize)
for {
if needReplySize <= 0 {
break
}
err = nil
reply := repl.NewStreamReadResponsePacket(p.ReqID, p.PartitionID, p.ExtentID)
reply.StartT = p.StartT
currReadSize := uint32(util.Min(int(needReplySize), util.ReadBlockSize))
if currReadSize == util.ReadBlockSize {
reply.Data, _ = proto.Buffers.Get(util.ReadBlockSize)
} else {
reply.Data = make([]byte, currReadSize)
}
if !shallDegrade {
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
tpObject = exporter.NewTPCnt(fmt.Sprintf("Repair_%s", p.GetOpMsg()))
}
reply.ExtentOffset = offset
p.Size = currReadSize
p.ExtentOffset = offset
partition.Disk().allocCheckLimit(proto.IopsReadType, 1)
partition.Disk().allocCheckLimit(proto.FlowReadType, currReadSize)
partition.disk.limitRead.Run(int(currReadSize), func() {
reply.CRC, err = store.Read(reply.ExtentID, offset, int64(currReadSize), reply.Data, isRepairRead)
})
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
tpObject.Set(err)
}
partition.checkIsDiskError(err, ReadFlag)
p.CRC = reply.CRC
if err != nil {
log.LogErrorf("action[operatePacket] err %v", err)
return
}
reply.Size = currReadSize
reply.ResultCode = proto.OpOk
reply.Opcode = p.Opcode
p.ResultCode = proto.OpOk
if err = reply.WriteToConn(connect); err != nil {
return
}
needReplySize -= currReadSize
offset += int64(currReadSize)
if currReadSize == util.ReadBlockSize {
proto.Buffers.Put(reply.Data)
}
logContent := fmt.Sprintf("action[operatePacket] %v.",
reply.LogMessage(reply.GetOpMsg(), connect.RemoteAddr().String(), reply.StartT, err))
log.LogReadf(logContent)
}
p.PacketOkReply()
}
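// Handle OpGetAllWatermarks packet. For normal extents it returns the watermarks of
// all extents; for tiny extents it returns the watermarks of the extent IDs listed
// in the request body.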
func (s *DataNode) handlePacketToGetAllWatermarks(p *repl.Packet) {
var (
buf []byte
fInfoList []*storage.ExtentInfo
err error
)
partition := p.Object.(*DataPartition)
store := partition.ExtentStore()
if proto.IsNormalExtentType(p.ExtentType) {
fInfoList, _, err = store.GetAllWatermarks(storage.NormalExtentFilter())
} else {
extents := make([]uint64, 0)
err = json.Unmarshal(p.Data, &extents)
if err == nil {
fInfoList, _, err = store.GetAllWatermarks(storage.TinyExtentFilter(extents))
}
}
if err != nil {
p.PackErrorBody(ActionGetAllExtentWatermarks, err.Error())
} else {
buf, err = json.Marshal(fInfoList)
if err != nil {
p.PackErrorBody(ActionGetAllExtentWatermarks, err.Error())
} else {
p.PacketOkWithByte(buf)
}
}
}
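// writeEmptyPacketOnTinyExtentRepairRead replies with an empty packet telling the
// peer to skip a hole of (newOffset - currentOffset) bytes in the tiny extent.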
func (s *DataNode) writeEmptyPacketOnTinyExtentRepairRead(reply *repl.Packet, newOffset, currentOffset int64, connect net.Conn) (replySize int64, err error) {
replySize = newOffset - currentOffset
reply.Data = make([]byte, 0)
reply.Size = 0
reply.CRC = crc32.ChecksumIEEE(reply.Data)
reply.ResultCode = proto.OpOk
reply.ExtentOffset = currentOffset
reply.Arg[0] = EmptyResponse
binary.BigEndian.PutUint64(reply.Arg[1:9], uint64(replySize))
err = reply.WriteToConn(connect)
reply.Size = uint32(replySize)
logContent := fmt.Sprintf("action[operatePacket] %v.",
reply.LogMessage(reply.GetOpMsg(), connect.RemoteAddr().String(), reply.StartT, err))
log.LogReadf(logContent)
return
}
func (s *DataNode) attachAvaliSizeOnTinyExtentRepairRead(reply *repl.Packet, avaliSize uint64) {
binary.BigEndian.PutUint64(reply.Arg[9:17], avaliSize)
}
// Handle tinyExtentRepairRead packet.
func (s *DataNode) tinyExtentRepairRead(request *repl.Packet, connect net.Conn) {
var (
err error
needReplySize int64
tinyExtentFinfoSize uint64
)
defer func() {
if err != nil {
request.PackErrorBody(ActionStreamReadTinyExtentRepair, err.Error())
request.WriteToConn(connect)
}
}()
if !storage.IsTinyExtent(request.ExtentID) {
err = fmt.Errorf("unavali extentID (%v)", request.ExtentID)
return
}
partition := request.Object.(*DataPartition)
store := partition.ExtentStore()
tinyExtentFinfoSize, err = store.TinyExtentGetFinfoSize(request.ExtentID)
if err != nil {
return
}
needReplySize = int64(request.Size)
offset := request.ExtentOffset
if uint64(request.ExtentOffset)+uint64(request.Size) > tinyExtentFinfoSize {
needReplySize = int64(tinyExtentFinfoSize - uint64(request.ExtentOffset))
}
avaliReplySize := uint64(needReplySize)
var newOffset, newEnd int64
for {
if needReplySize <= 0 {
break
}
reply := repl.NewTinyExtentStreamReadResponsePacket(request.ReqID, request.PartitionID, request.ExtentID)
reply.ArgLen = TinyExtentRepairReadResponseArgLen
reply.Arg = make([]byte, TinyExtentRepairReadResponseArgLen)
s.attachAvaliSizeOnTinyExtentRepairRead(reply, avaliReplySize)
newOffset, newEnd, err = store.TinyExtentAvaliOffset(request.ExtentID, offset)
if err != nil {
return
}
if newOffset > offset {
var replySize int64
if replySize, err = s.writeEmptyPacketOnTinyExtentRepairRead(reply, newOffset, offset, connect); err != nil {
return
}
needReplySize -= replySize
offset += replySize
continue
}
currNeedReplySize := newEnd - newOffset
currReadSize := uint32(util.Min(int(currNeedReplySize), util.ReadBlockSize))
if currReadSize == util.ReadBlockSize {
reply.Data, _ = proto.Buffers.Get(util.ReadBlockSize)
} else {
reply.Data = make([]byte, currReadSize)
}
reply.ExtentOffset = offset
reply.CRC, err = store.Read(reply.ExtentID, offset, int64(currReadSize), reply.Data, false)
if err != nil {
return
}
reply.Size = uint32(currReadSize)
reply.ResultCode = proto.OpOk
if err = reply.WriteToConn(connect); err != nil {
connect.Close()
return
}
needReplySize -= int64(currReadSize)
offset += int64(currReadSize)
if currReadSize == util.ReadBlockSize {
proto.Buffers.Put(reply.Data)
}
logContent := fmt.Sprintf("action[operatePacket] %v.",
reply.LogMessage(reply.GetOpMsg(), connect.RemoteAddr().String(), reply.StartT, err))
log.LogReadf(logContent)
}
request.PacketOkReply()
}
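// Handle OpReadTinyDeleteRecord packet. Streams the local tiny-delete record file
// from the requested offset in chunks of at most MaxSyncTinyDeleteBufferSize bytes.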
func (s *DataNode) handlePacketToReadTinyDeleteRecordFile(p *repl.Packet, connect net.Conn) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionStreamReadTinyDeleteRecord, err.Error())
p.WriteToConn(connect)
}
}()
partition := p.Object.(*DataPartition)
store := partition.ExtentStore()
localTinyDeleteFileSize, err := store.LoadTinyDeleteFileOffset()
if err != nil {
return
}
needReplySize := localTinyDeleteFileSize - p.ExtentOffset
offset := p.ExtentOffset
reply := repl.NewReadTinyDeleteRecordResponsePacket(p.ReqID, p.PartitionID)
reply.StartT = time.Now().UnixNano()
for {
if needReplySize <= 0 {
break
}
err = nil
currReadSize := uint32(util.Min(int(needReplySize), MaxSyncTinyDeleteBufferSize))
reply.Data = make([]byte, currReadSize)
reply.ExtentOffset = offset
reply.CRC, err = store.ReadTinyDeleteRecords(offset, int64(currReadSize), reply.Data)
if err != nil {
err = fmt.Errorf(ActionStreamReadTinyDeleteRecord+" localTinyDeleteRecordSize(%v) offset(%v)"+
" currReadSize(%v) err(%v)", localTinyDeleteFileSize, offset, currReadSize, err)
return
}
reply.Size = uint32(currReadSize)
reply.ResultCode = proto.OpOk
if err = reply.WriteToConn(connect); err != nil {
return
}
needReplySize -= int64(currReadSize)
offset += int64(currReadSize)
}
p.PacketOkReply()
}
// Handle OpNotifyReplicasToRepair packet.
func (s *DataNode) handlePacketToNotifyExtentRepair(p *repl.Packet) {
var err error
partition := p.Object.(*DataPartition)
mf := new(DataPartitionRepairTask)
err = json.Unmarshal(p.Data, mf)
if err != nil {
p.PackErrorBody(ActionRepair, err.Error())
return
}
partition.DoExtentStoreRepair(mf)
p.PacketOkReply()
}
// Handle OpBroadcastMinAppliedID
func (s *DataNode) handleBroadcastMinAppliedID(p *repl.Packet) {
partition := p.Object.(*DataPartition)
minAppliedID := binary.BigEndian.Uint64(p.Data)
if minAppliedID > 0 {
partition.SetMinAppliedID(minAppliedID)
}
log.LogDebugf("[handleBroadcastMinAppliedID] partition(%v) minAppliedID(%v)", partition.partitionID, minAppliedID)
p.PacketOkReply()
}
// Handle handlePacketToGetAppliedID packet.
func (s *DataNode) handlePacketToGetAppliedID(p *repl.Packet) {
partition := p.Object.(*DataPartition)
appliedID := partition.GetAppliedID()
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, appliedID)
p.PacketOkWithBody(buf)
p.AddMesgLog(fmt.Sprintf("_AppliedID(%v)", appliedID))
}
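// Handle the packet that queries the partition's used size, as computed by the
// extent store for the given extent ID.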
func (s *DataNode) handlePacketToGetPartitionSize(p *repl.Packet) {
partition := p.Object.(*DataPartition)
usedSize := partition.extentStore.StoreSizeExtentID(p.ExtentID)
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, uint64(usedSize))
p.AddMesgLog(fmt.Sprintf("partitionSize_(%v)", usedSize))
p.PacketOkWithBody(buf)
}
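// Handle the packet that queries the max extent ID and the total partition size.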
func (s *DataNode) handlePacketToGetMaxExtentIDAndPartitionSize(p *repl.Packet) {
partition := p.Object.(*DataPartition)
maxExtentID, totalPartitionSize := partition.extentStore.GetMaxExtentIDAndPartitionSize()
buf := make([]byte, 16)
binary.BigEndian.PutUint64(buf[0:8], uint64(maxExtentID))
binary.BigEndian.PutUint64(buf[8:16], totalPartitionSize)
p.PacketOkWithBody(buf)
}
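// Handle the packet to decommission a data partition: on the raft leader, the new
// peer (if any) is added and the old peer removed through raft membership changes.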
func (s *DataNode) handlePacketToDecommissionDataPartition(p *repl.Packet) {
var (
err error
reqData []byte
isRaftLeader bool
req = &proto.DataPartitionDecommissionRequest{}
)
defer func() {
if err != nil {
p.PackErrorBody(ActionDecommissionPartition, err.Error())
} else {
p.PacketOkReply()
}
}()
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
return
}
reqData, err = json.Marshal(adminTask.Request)
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
p.AddMesgLog(string(reqData))
dp := s.space.Partition(req.PartitionId)
if dp == nil {
err = fmt.Errorf("partition %v not exsit", req.PartitionId)
return
}
p.PartitionID = req.PartitionId
isRaftLeader, err = s.forwardToRaftLeader(dp, p, false)
if !isRaftLeader {
err = raft.ErrNotLeader
return
}
if req.AddPeer.ID == req.RemovePeer.ID {
err = errors.NewErrorf("[opOfflineDataPartition]: AddPeer(%v) same withRemovePeer(%v)", req.AddPeer, req.RemovePeer)
return
}
if req.AddPeer.ID != 0 {
_, err = dp.ChangeRaftMember(raftProto.ConfAddNode, raftProto.Peer{ID: req.AddPeer.ID}, reqData)
if err != nil {
return
}
}
_, err = dp.ChangeRaftMember(raftProto.ConfRemoveNode, raftProto.Peer{ID: req.RemovePeer.ID}, reqData)
if err != nil {
return
}
}
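// Handle the packet to add a raft member to a data partition. The request is
// forwarded to the raft leader when the local replica is not the leader, and is a
// no-op when the peer already exists.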
func (s *DataNode) handlePacketToAddDataPartitionRaftMember(p *repl.Packet) {
var (
err error
reqData []byte
isRaftLeader bool
req = &proto.AddDataPartitionRaftMemberRequest{}
)
defer func() {
if err != nil {
p.PackErrorBody(ActionAddDataPartitionRaftMember, err.Error())
} else {
p.PacketOkReply()
}
}()
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
return
}
reqData, err = json.Marshal(adminTask.Request)
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
log.LogInfof("action[handlePacketToAddDataPartitionRaftMember] %v, partition id %v", req.AddPeer, req.PartitionId)
p.AddMesgLog(string(reqData))
dp := s.space.Partition(req.PartitionId)
if dp == nil {
err = proto.ErrDataPartitionNotExists
return
}
p.PartitionID = req.PartitionId
if dp.IsExistReplica(req.AddPeer.Addr) {
log.LogInfof("handlePacketToAddDataPartitionRaftMember recive MasterCommand: %v "+
"addRaftAddr(%v) has exsit", string(reqData), req.AddPeer.Addr)
return
}
isRaftLeader, err = s.forwardToRaftLeader(dp, p, false)
if !isRaftLeader {
return
}
log.LogInfof("action[handlePacketToAddDataPartitionRaftMember] before ChangeRaftMember %v which is sync. partition id %v", req.AddPeer, req.PartitionId)
if req.AddPeer.ID != 0 {
_, err = dp.ChangeRaftMember(raftProto.ConfAddNode, raftProto.Peer{ID: req.AddPeer.ID}, reqData)
if err != nil {
return
}
}
log.LogInfof("action[handlePacketToAddDataPartitionRaftMember] after ChangeRaftMember %v, partition id %v", req.AddPeer, &req.PartitionId)
}
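// Handle the packet to remove a raft member from a data partition. When Force is
// set, the member is removed directly and the metadata persisted without going
// through a normal raft membership change.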
func (s *DataNode) handlePacketToRemoveDataPartitionRaftMember(p *repl.Packet) {
var (
err error
reqData []byte
isRaftLeader bool
req = &proto.RemoveDataPartitionRaftMemberRequest{}
)
defer func() {
if err != nil {
p.PackErrorBody(ActionRemoveDataPartitionRaftMember, err.Error())
} else {
p.PacketOkReply()
}
}()
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
return
}
reqData, err = json.Marshal(adminTask.Request)
p.AddMesgLog(string(reqData))
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
dp := s.space.Partition(req.PartitionId)
if dp == nil {
return
}
log.LogDebugf("action[handlePacketToRemoveDataPartitionRaftMember], req %v (%s) RemoveRaftPeer(%s) dp %v replicaNum %v",
p.GetReqID(), string(reqData), req.RemovePeer.Addr, dp.partitionID, dp.replicaNum)
p.PartitionID = req.PartitionId
if !dp.IsExistReplica(req.RemovePeer.Addr) {
log.LogWarnf("action[handlePacketToRemoveDataPartitionRaftMember] receive MasterCommand: req %v[%v] "+
"RemoveRaftPeer(%v) has not exist", p.GetReqID(), string(reqData), req.RemovePeer.Addr)
return
}
isRaftLeader, err = s.forwardToRaftLeader(dp, p, req.Force)
if !isRaftLeader {
log.LogWarnf("handlePacketToRemoveDataPartitionRaftMember return no leader")
return
}
if err = dp.CanRemoveRaftMember(req.RemovePeer, req.Force); err != nil {
log.LogWarnf("action[handlePacketToRemoveDataPartitionRaftMember] CanRemoveRaftMember failed "+
"req %v dp %v err %v",
p.GetReqID(), dp.partitionID, err.Error())
return
}
if req.Force {
cc := &raftProto.ConfChange{
Type: raftProto.ConfRemoveNode,
Peer: raftProto.Peer{
ID: req.RemovePeer.ID,
},
Context: reqData,
}
s.raftStore.RaftServer().RemoveRaftForce(dp.partitionID, cc)
dp.ApplyMemberChange(cc, 0)
dp.PersistMetadata()
return
}
if req.RemovePeer.ID != 0 {
log.LogDebugf("action[handlePacketToRemoveDataPartitionRaftMember] ChangeRaftMember "+
"req %v dp %v RemovePeer.ID %v", p.GetReqID(), dp.partitionID, req.RemovePeer.ID)
_, err = dp.ChangeRaftMember(raftProto.ConfRemoveNode, raftProto.Peer{ID: req.RemovePeer.ID}, reqData)
if err != nil {
return
}
}
log.LogDebugf("action[handlePacketToRemoveDataPartitionRaftMember] CanRemoveRaftMember complete "+
"req %v dp %v ", p.GetReqID(), dp.partitionID)
}
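// Handle the packet that asks this replica to campaign for raft leadership of the
// partition.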
func (s *DataNode) handlePacketToDataPartitionTryToLeader(p *repl.Packet) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionDataPartitionTryToLeader, err.Error())
log.LogWarnf("handlePacketToDataPartitionTryToLeader: %v ", err.Error())
} else {
p.PacketOkReply()
log.LogDebugf("handlePacketToDataPartitionTryToLeader: partition %v success ", p.PartitionID)
}
}()
log.LogDebugf("handlePacketToDataPartitionTryToLeader: partition %v ", p.PartitionID)
dp := s.space.Partition(p.PartitionID)
if dp == nil {
err = fmt.Errorf("partition %v not exsit", p.PartitionID)
return
}
if dp.raftStatus != RaftStatusRunning {
err = fmt.Errorf("partition %v raft not running", p.PartitionID)
return
}
if dp.raftPartition.IsRaftLeader() {
log.LogWarnf("handlePacketToDataPartitionTryToLeader: %v is already leader", p.PartitionID)
return
}
err = dp.raftPartition.TryToLeader(dp.partitionID)
}
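// forwardToRaftLeader returns ok=true when the local replica is the raft leader, or
// when force is set and no leader is known; otherwise it forwards the packet to the
// leader and reads the reply back into p.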
func (s *DataNode) forwardToRaftLeader(dp *DataPartition, p *repl.Packet, force bool) (ok bool, err error) {
var (
conn *net.TCPConn
leaderAddr string
)
if leaderAddr, ok = dp.IsRaftLeader(); ok {
return
}
// Return NoLeaderError if no leader address is known.
if leaderAddr == "" {
if force {
ok = true
log.LogInfof("action[forwardToRaftLeader] no leader but replica num %v continue", dp.replicaNum)
return
}
err = storage.NoLeaderError
return
}
// forward the packet to the leader if local one is not the leader
conn, err = gConnPool.GetConnect(leaderAddr)
if err != nil {
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn)
if err != nil {
return
}
if err = p.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime); err != nil {
return
}
return
}
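// Handle OpStopDataPartitionRepair packet: pauses or resumes decommission recovery
// for the partition according to the Stop flag in the request.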
func (s *DataNode) handlePacketToStopDataPartitionRepair(p *repl.Packet) {
task := &proto.AdminTask{}
err := json.Unmarshal(p.Data, task)
defer func() {
if err != nil {
p.PackErrorBody(ActionStopDataPartitionRepair, err.Error())
} else {
p.PacketOkReply()
}
}()
if err != nil {
return
}
request := &proto.StopDataPartitionRepairRequest{}
if task.OpCode != proto.OpStopDataPartitionRepair {
err = fmt.Errorf("action[handlePacketToStopDataPartitionRepair] illegal opcode ")
log.LogWarnf("action[handlePacketToStopDataPartitionRepair] illegal opcode ")
return
}
bytes, _ := json.Marshal(task.Request)
p.AddMesgLog(string(bytes))
err = json.Unmarshal(bytes, request)
if err != nil {
return
}
log.LogDebugf("action[handlePacketToStopDataPartitionRepair] try stop %v", request.PartitionId)
dp := s.space.Partition(request.PartitionId)
if dp == nil {
err = proto.ErrDataPartitionNotExists
log.LogWarnf("action[handlePacketToStopDataPartitionRepair] cannot find dp %v", request.PartitionId)
return
}
dp.StopDecommissionRecover(request.Stop)
log.LogInfof("action[handlePacketToStopDataPartitionRepair] %v stop %v success", request.PartitionId, request.Stop)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"sync/atomic"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
)
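// Post runs after a packet has been processed: it decides whether the packet still
// needs a reply, returns a leader-held tiny extent to the available or broken
// channel, and records timing metrics.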
func (s *DataNode) Post(p *repl.Packet) error {
if p.IsMasterCommand() {
p.NeedReply = true
}
if p.IsReadOperation() && p.AfterPre {
p.NeedReply = false
}
s.cleanupPkt(p)
s.addMetrics(p)
return nil
}
func (s *DataNode) cleanupPkt(p *repl.Packet) {
if p.IsMasterCommand() {
return
}
if !p.IsLeaderPacket() {
return
}
s.releaseExtent(p)
}
func (s *DataNode) releaseExtent(p *repl.Packet) {
if p == nil || !storage.IsTinyExtent(p.ExtentID) || p.ExtentID <= 0 || atomic.LoadInt32(&p.IsReleased) == IsReleased {
return
}
if !proto.IsTinyExtentType(p.ExtentType) || !p.IsLeaderPacket() || !p.IsNormalWriteOperation() || !p.IsForwardPkt() {
return
}
if p.Object == nil {
return
}
partition := p.Object.(*DataPartition)
store := partition.ExtentStore()
if p.IsErrPacket() {
store.SendToBrokenTinyExtentC(p.ExtentID)
} else {
store.SendToAvailableTinyExtentC(p.ExtentID)
}
atomic.StoreInt32(&p.IsReleased, IsReleased)
}
func (s *DataNode) addMetrics(p *repl.Packet) {
if p.IsMasterCommand() || p.ShallDegrade() {
return
}
p.AfterTp()
if p.Object == nil {
return
}
partition := p.Object.(*DataPartition)
if partition == nil {
return
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/json"
"fmt"
"hash/crc32"
"sync/atomic"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/log"
)
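// Prepare validates an incoming packet before it enters the replication pipeline:
// it tags the packet for metrics (unless degraded), then checks the store mode, the
// CRC of write payloads and the target partition, and fills in extent information
// where needed.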
func (s *DataNode) Prepare(p *repl.Packet) (err error) {
defer func() {
p.SetPacketHasPrepare()
if err != nil {
p.PackErrorBody(repl.ActionPreparePkt, err.Error())
} else {
p.AfterPre = true
}
}()
if p.IsMasterCommand() {
return
}
atomic.AddUint64(&s.metricsCnt, 1)
if !s.shallDegrade() {
p.BeforeTp(s.clusterID)
p.UnsetDegrade()
} else {
p.SetDegrade()
}
err = s.checkStoreMode(p)
if err != nil {
return
}
if err = s.checkCrc(p); err != nil {
return
}
if err = s.checkPartition(p); err != nil {
return
}
// For certain packets, we need to add some additional extent information.
if err = s.checkPacketAndPrepare(p); err != nil {
return
}
return
}
func (s *DataNode) checkStoreMode(p *repl.Packet) (err error) {
if proto.IsTinyExtentType(p.ExtentType) || proto.IsNormalExtentType(p.ExtentType) {
return
}
log.LogErrorf("action[checkStoreMode] dp [%v] reqId [%v] extent type %v", p.PartitionID, p.ReqID, p.ExtentType)
return ErrIncorrectStoreType
}
func (s *DataNode) checkCrc(p *repl.Packet) (err error) {
if !p.IsNormalWriteOperation() {
return
}
crc := crc32.ChecksumIEEE(p.Data[:p.Size])
if crc != p.CRC {
return storage.CrcMismatchError
}
return
}
func (s *DataNode) checkPartition(p *repl.Packet) (err error) {
dp := s.space.Partition(p.PartitionID)
if dp == nil {
// err = proto.ErrDataPartitionNotExists
err = fmt.Errorf("data partition not exists %v", p.PartitionID)
return
}
p.Object = dp
if p.IsNormalWriteOperation() || p.IsCreateExtentOperation() {
if dp.Available() <= 0 {
err = storage.NoSpaceError
return
}
}
if p.IsNormalWriteOperation() || p.IsRandomWrite() {
dp.disk.allocCheckLimit(proto.FlowWriteType, uint32(p.Size))
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
}
return
}
func (s *DataNode) checkPacketAndPrepare(p *repl.Packet) error {
partition := p.Object.(*DataPartition)
store := partition.ExtentStore()
var (
extentID uint64
err error
)
log.LogDebugf("action[prepare.checkPacketAndPrepare] pack opcode (%v) p.IsLeaderPacket(%v) p (%v)", p.Opcode, p.IsLeaderPacket(), p)
if p.IsRandomWrite() || p.IsSnapshotModWriteAppendOperation() || p.IsNormalWriteOperation() {
if err = partition.CheckWriteVer(p); err != nil {
return err
}
}
if p.IsLeaderPacket() && proto.IsTinyExtentType(p.ExtentType) && p.IsNormalWriteOperation() {
extentID, err = store.GetAvailableTinyExtent()
if err != nil {
return fmt.Errorf("checkPacketAndPrepare partition %v GetAvailableTinyExtent error %v", p.PartitionID, err.Error())
}
p.ExtentID = extentID
p.ExtentOffset, err = store.GetTinyExtentOffset(extentID)
if err != nil {
return fmt.Errorf("checkPacketAndPrepare partition %v %v GetTinyExtentOffset error %v", p.PartitionID, extentID, err.Error())
}
} else if p.IsSnapshotModWriteAppendOperation() {
if proto.IsTinyExtentType(p.ExtentType) {
extentID, err = store.GetAvailableTinyExtent()
if err != nil {
log.LogErrorf("err %v", err)
return fmt.Errorf("checkPacketAndPrepare partition %v GetAvailableTinyExtent error %v", p.PartitionID, err.Error())
}
p.ExtentID = extentID
p.ExtentOffset, err = store.GetTinyExtentOffset(p.ExtentID)
if err != nil {
err = fmt.Errorf("checkPacketAndPrepare partition %v %v GetTinyExtentOffset error %v", p.PartitionID, extentID, err.Error())
log.LogErrorf("err %v", err)
}
log.LogDebugf("action[prepare.checkPacketAndPrepare] dp %v append randomWrite p.ExtentOffset %v Kernel(file)Offset %v",
p.PartitionID, p.ExtentOffset, p.KernelOffset)
return err
}
p.ExtentOffset, err = store.GetExtentSnapshotModOffset(p.ExtentID, p.Size)
log.LogDebugf("action[prepare.checkPacketAndPrepare] pack (%v) partition %v %v", p, p.PartitionID, extentID)
if err != nil {
return fmt.Errorf("checkPacketAndPrepare partition %v %v GetSnapshotModExtentOffset error %v", p.PartitionID, extentID, err.Error())
}
} else if p.IsLeaderPacket() && p.IsCreateExtentOperation() {
if partition.isNormalType() && partition.GetExtentCount() >= storage.MaxExtentCount*3 {
return fmt.Errorf("checkPacketAndPrepare partition %v has reached maxExtentId", p.PartitionID)
}
p.ExtentID, err = store.NextExtentID()
if err != nil {
return fmt.Errorf("checkPacketAndPrepare partition %v allocCheckLimit NextExtentId error %v", p.PartitionID, err)
}
} else if p.IsLeaderPacket() &&
((p.IsMarkDeleteExtentOperation() && proto.IsTinyExtentType(p.ExtentType)) ||
(p.IsMarkSplitExtentOperation() && !proto.IsTinyExtentType(p.ExtentType))) {
log.LogDebugf("checkPacketAndPrepare. packet opCode %v p.ExtentType %v", p.Opcode, p.ExtentType)
record := new(proto.TinyExtentDeleteRecord)
if err := json.Unmarshal(p.Data[:p.Size], record); err != nil {
return fmt.Errorf("checkPacketAndPrepare failed %v", err.Error())
}
p.Data, _ = json.Marshal(record)
p.Size = uint32(len(p.Data))
}
if (p.IsCreateExtentOperation() || p.IsNormalWriteOperation()) && p.ExtentID == 0 {
return fmt.Errorf("checkPacketAndPrepare partition %v invalid extent id. ", p.PartitionID)
}
p.OrgBuffer = p.Data
return nil
}
package fuse
import (
"sync"
)
const (
NumOfBlockPool = 9
)
const (
BlockSize = 4096
)
const (
PoolSize4K = BlockSize * (1 << iota)
PoolSize8K
PoolSize16K
PoolSize32K
PoolSize64K
PoolSize128K
PoolSize256K
PoolSize512K
PoolSize1024K
)
const (
PoolSizeWithHeader4K = BlockSize*(1<<iota) + OutHeaderSize
PoolSizeWithHeader8K
PoolSizeWithHeader16K
PoolSizeWithHeader32K
PoolSizeWithHeader64K
PoolSizeWithHeader128K
PoolSizeWithHeader256K
PoolSizeWithHeader512K
PoolSizeWithHeader1024K
)
var ReadBlockPool = [NumOfBlockPool]*sync.Pool{}
func InitReadBlockPool() {
ReadBlockPool[0] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader4K)
}}
ReadBlockPool[1] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader8K)
}}
ReadBlockPool[2] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader16K)
}}
ReadBlockPool[3] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader32K)
}}
ReadBlockPool[4] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader64K)
}}
ReadBlockPool[5] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader128K)
}}
ReadBlockPool[6] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader256K)
}}
ReadBlockPool[7] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader512K)
}}
ReadBlockPool[8] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader1024K)
}}
}
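// GetBlockBuf returns a read buffer (with room for the FUSE out header) for the
// given block size, falling back to a plain allocation for sizes outside the pools.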
func GetBlockBuf(size int) []byte {
var data []byte
switch size {
case PoolSize4K:
data = ReadBlockPool[0].Get().([]byte)
case PoolSize8K:
data = ReadBlockPool[1].Get().([]byte)
case PoolSize16K:
data = ReadBlockPool[2].Get().([]byte)
case PoolSize32K:
data = ReadBlockPool[3].Get().([]byte)
case PoolSize64K:
data = ReadBlockPool[4].Get().([]byte)
case PoolSize128K:
data = ReadBlockPool[5].Get().([]byte)
case PoolSize256K:
data = ReadBlockPool[6].Get().([]byte)
case PoolSize512K:
data = ReadBlockPool[7].Get().([]byte)
case PoolSize1024K:
data = ReadBlockPool[8].Get().([]byte)
default:
data = make([]byte, OutHeaderSize+size)
}
return data
}
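// PutBlockBuf returns a buffer obtained from GetBlockBuf to its pool; buffers of
// unrecognized sizes are simply dropped.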
func PutBlockBuf(data []byte) {
switch len(data) {
case PoolSizeWithHeader4K:
ReadBlockPool[0].Put(data)
case PoolSizeWithHeader8K:
ReadBlockPool[1].Put(data)
case PoolSizeWithHeader16K:
ReadBlockPool[2].Put(data)
case PoolSizeWithHeader32K:
ReadBlockPool[3].Put(data)
case PoolSizeWithHeader64K:
ReadBlockPool[4].Put(data)
case PoolSizeWithHeader128K:
ReadBlockPool[5].Put(data)
case PoolSizeWithHeader256K:
ReadBlockPool[6].Put(data)
case PoolSizeWithHeader512K:
ReadBlockPool[7].Put(data)
case PoolSizeWithHeader1024K:
ReadBlockPool[8].Put(data)
default:
return
}
}
package fuse
import "unsafe"
// buffer provides a mechanism for constructing a message from
// multiple segments.
type buffer []byte
// alloc allocates size bytes and returns a pointer to the new
// segment.
func (w *buffer) alloc(size uintptr) unsafe.Pointer {
s := int(size)
if len(*w)+s > cap(*w) {
old := *w
*w = make([]byte, len(*w), 2*cap(*w)+s)
copy(*w, old)
}
l := len(*w)
*w = (*w)[:l+s]
return unsafe.Pointer(&(*w)[l])
}
// reset clears out the contents of the buffer.
func (w *buffer) reset() {
for i := range (*w)[:cap(*w)] {
(*w)[i] = 0
}
*w = (*w)[:0]
}
func newBuffer(extra uintptr) buffer {
const hdrSize = unsafe.Sizeof(outHeader{})
buf := make(buffer, hdrSize, hdrSize+extra)
return buf
}
package fuse
import (
"runtime"
)
func stack() string {
buf := make([]byte, 1024)
return string(buf[:runtime.Stack(buf, false)])
}
func nop(msg interface{}) {}
// Debug is called to output debug messages, including protocol
// traces. The default behavior is to do nothing.
//
// The messages have human-friendly string representations and are
// safe to marshal to JSON.
//
// Implementations must not retain msg.
var Debug func(msg interface{}) = nop
package fuse
import (
"syscall"
)
const (
ENODATA = Errno(syscall.ENODATA)
)
const (
errNoXattr = ENODATA
)
func init() {
errnoNames[errNoXattr] = "ENODATA"
}
// FUSE service loop, for servers that wish to use it.
package fs // import "github.com/cubefs/cubefs/depends/bazil.org/fuse/fs"
import (
"encoding/binary"
"fmt"
"hash/fnv"
"io"
"log"
"net"
"os"
"reflect"
"runtime"
"strings"
"sync"
"time"
"unsafe"
"github.com/cubefs/cubefs/proto"
"bytes"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/depends/bazil.org/fuse/fuseutil"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/stat"
"golang.org/x/net/context"
"golang.org/x/time/rate"
)
const (
attrValidTime = 1 * time.Minute
entryValidTime = 1 * time.Minute
)
const (
defaultForgetServeLimit = rate.Limit(1 << 16)
defaultForgetServeBurst = 128
)
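// ForgetServeLimit throttles how quickly Forget requests are served, so that a
// burst of kernel forgets does not crowd out other requests.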
var ForgetServeLimit *rate.Limiter = rate.NewLimiter(defaultForgetServeLimit, defaultForgetServeBurst)
// TODO: FINISH DOCS
type FSStatType uint32
const (
FSStatResume FSStatType = iota
FSStatSuspend
FSStatShutdown
FSStatRestore
)
// An FS is the interface required of a file system.
//
// Other FUSE requests can be handled by implementing methods from the
// FS* interfaces, for example FSStatfser.
type FS interface {
// Root is called to obtain the Node for the file system root.
Root() (Node, error)
Node(ino, pino uint64, mode uint32) (Node, error)
State() (FSStatType, string)
Notify(stat FSStatType, msg interface{})
}
type FSStatfser interface {
// Statfs is called to obtain file system metadata.
// It should write that data to resp.
Statfs(ctx context.Context, req *fuse.StatfsRequest, resp *fuse.StatfsResponse) error
}
type FSDestroyer interface {
// Destroy is called when the file system is shutting down.
//
// Linux only sends this request for block device backed (fuseblk)
// filesystems, to allow them to flush writes to disk before the
// unmount completes.
Destroy()
}
type FSInodeGenerator interface {
// GenerateInode is called to pick a dynamic inode number when it
// would otherwise be 0.
//
// Not all filesystems bother tracking inodes, but FUSE requires
// the inode to be set, and fewer duplicates in general makes UNIX
// tools work better.
//
// Operations where the nodes may return 0 inodes include Getattr,
// Setattr and ReadDir.
//
// If FS does not implement FSInodeGenerator, GenerateDynamicInode
// is used.
//
// Implementing this is useful to e.g. constrain the range of
// inode values used for dynamic inodes.
GenerateInode(parentInode uint64, name string) uint64
}
// A Node is the interface required of a file or directory.
// See the documentation for type FS for general information
// pertaining to all methods.
//
// A Node must be usable as a map key, that is, it cannot be a
// function, map or slice.
//
// Other FUSE requests can be handled by implementing methods from the
// Node* interfaces, for example NodeOpener.
//
// Methods returning Node should take care to return the same Node
// when the result is logically the same instance. Without this, each
// Node will get a new NodeID, causing spurious cache invalidations,
// extra lookups and aliasing anomalies. This may not matter for a
// simple, read-only filesystem.
type Node interface {
// Attr fills attr with the standard metadata for the node.
//
// Fields with reasonable defaults are prepopulated. For example,
// all times are set to a fixed moment when the program started.
//
// If Inode is left as 0, a dynamic inode number is chosen.
//
// The result may be cached for the duration set in Valid.
Attr(ctx context.Context, attr *fuse.Attr) error
}
type NodeGetattrer interface {
// Getattr obtains the standard metadata for the receiver.
// It should store that metadata in resp.
//
// If this method is not implemented, the attributes will be
// generated based on Attr(), with zero values filled in.
Getattr(ctx context.Context, req *fuse.GetattrRequest, resp *fuse.GetattrResponse) error
}
type NodeSetattrer interface {
// Setattr sets the standard metadata for the receiver.
//
// Note, this is also used to communicate changes in the size of
// the file, outside of Writes.
//
// req.Valid is a bitmask of what fields are actually being set.
// For example, the method should not change the mode of the file
// unless req.Valid.Mode() is true.
Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *fuse.SetattrResponse) error
}
type NodeSymlinker interface {
// Symlink creates a new symbolic link in the receiver, which must be a directory.
//
// TODO is the above true about directories?
Symlink(ctx context.Context, req *fuse.SymlinkRequest) (Node, error)
}
// This optional request will be called only for symbolic link nodes.
type NodeReadlinker interface {
// Readlink reads a symbolic link.
Readlink(ctx context.Context, req *fuse.ReadlinkRequest) (string, error)
}
type NodeLinker interface {
// Link creates a new directory entry in the receiver based on an
// existing Node. Receiver must be a directory.
Link(ctx context.Context, req *fuse.LinkRequest, old Node) (Node, error)
}
type NodeRemover interface {
// Remove removes the entry with the given name from
// the receiver, which must be a directory. The entry to be removed
// may correspond to a file (unlink) or to a directory (rmdir).
Remove(ctx context.Context, req *fuse.RemoveRequest) error
}
type NodeAccesser interface {
// Access checks whether the calling context has permission for
// the given operations on the receiver. If so, Access should
// return nil. If not, Access should return EPERM.
//
// Note that this call affects the result of the access(2) system
// call but not the open(2) system call. If Access is not
// implemented, the Node behaves as if it always returns nil
// (permission granted), relying on checks in Open instead.
Access(ctx context.Context, req *fuse.AccessRequest) error
}
type NodeStringLookuper interface {
// Lookup looks up a specific entry in the receiver,
// which must be a directory. Lookup should return a Node
// corresponding to the entry. If the name does not exist in
// the directory, Lookup should return ENOENT.
//
// Lookup need not handle the names "." and "..".
Lookup(ctx context.Context, name string) (Node, error)
}
type NodeRequestLookuper interface {
// Lookup looks up a specific entry in the receiver.
// See NodeStringLookuper for more.
Lookup(ctx context.Context, req *fuse.LookupRequest, resp *fuse.LookupResponse) (Node, error)
}
type NodeMkdirer interface {
Mkdir(ctx context.Context, req *fuse.MkdirRequest) (Node, error)
}
type NodeOpener interface {
// Open opens the receiver. After a successful open, a client
// process has a file descriptor referring to this Handle.
//
// Open can also be called on non-files. For example,
// directories are Opened for ReadDir or fchdir(2).
//
// If this method is not implemented, the open will always
// succeed, and the Node itself will be used as the Handle.
//
// XXX note about access. XXX OpenFlags.
Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenResponse) (Handle, error)
}
type NodeCreater interface {
// Create creates a new directory entry in the receiver, which
// must be a directory.
Create(ctx context.Context, req *fuse.CreateRequest, resp *fuse.CreateResponse) (Node, Handle, error)
}
type NodeForgetter interface {
// Forget about this node. This node will not receive further
// method calls.
//
// Forget is not necessarily seen on unmount, as all nodes are
// implicitly forgotten as part of the unmount.
Forget()
}
type NodeRenamer interface {
Rename(ctx context.Context, req *fuse.RenameRequest, newDir Node) error
}
type NodeMknoder interface {
Mknod(ctx context.Context, req *fuse.MknodRequest) (Node, error)
}
// TODO this should be on Handle not Node
type NodeFsyncer interface {
Fsync(ctx context.Context, req *fuse.FsyncRequest) error
}
type NodeGetxattrer interface {
// Getxattr gets an extended attribute by the given name from the
// node.
//
// If there is no xattr by that name, returns fuse.ErrNoXattr.
Getxattr(ctx context.Context, req *fuse.GetxattrRequest, resp *fuse.GetxattrResponse) error
}
type NodeListxattrer interface {
// Listxattr lists the extended attributes recorded for the node.
Listxattr(ctx context.Context, req *fuse.ListxattrRequest, resp *fuse.ListxattrResponse) error
}
type NodeSetxattrer interface {
// Setxattr sets an extended attribute with the given name and
// value for the node.
Setxattr(ctx context.Context, req *fuse.SetxattrRequest) error
}
type NodeRemovexattrer interface {
// Removexattr removes an extended attribute for the name.
//
// If there is no xattr by that name, returns fuse.ErrNoXattr.
Removexattr(ctx context.Context, req *fuse.RemovexattrRequest) error
}
var startTime = time.Now()
func nodeAttr(ctx context.Context, n Node, attr *fuse.Attr) error {
attr.Valid = attrValidTime
attr.Nlink = 1
attr.Atime = startTime
attr.Mtime = startTime
attr.Ctime = startTime
attr.Crtime = startTime
if err := n.Attr(ctx, attr); err != nil {
return err
}
return nil
}
// A Handle is the interface required of an opened file or directory.
// See the documentation for type FS for general information
// pertaining to all methods.
//
// Other FUSE requests can be handled by implementing methods from the
// Handle* interfaces. The most common to implement are HandleReader,
// HandleReadDirer, and HandleWriter.
//
// TODO implement methods: Getlk, Setlk, Setlkw
type Handle interface {
}
type HandleFlusher interface {
// Flush is called each time the file or directory is closed.
// Because there can be multiple file descriptors referring to a
// single opened file, Flush can be called multiple times.
Flush(ctx context.Context, req *fuse.FlushRequest) error
}
type HandleReadAller interface {
ReadAll(ctx context.Context) ([]byte, error)
}
type HandleReadDirer interface {
ReadDir(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) ([]fuse.Dirent, error)
}
type HandleReadDirAller interface {
ReadDirAll(ctx context.Context) ([]fuse.Dirent, error)
}
type HandleReader interface {
// Read requests to read data from the handle.
//
// There is a page cache in the kernel that normally submits only
// page-aligned reads spanning one or more pages. However, you
// should not rely on this. To see individual requests as
// submitted by the file system clients, set OpenDirectIO.
//
// Note that reads beyond the size of the file as reported by Attr
// are not even attempted (except in OpenDirectIO mode).
Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) error
}
type HandleWriter interface {
// Write requests to write data into the handle at the given offset.
// Store the amount of data written in resp.Size.
//
// There is a writeback page cache in the kernel that normally submits
// only page-aligned writes spanning one or more pages. However,
// you should not rely on this. To see individual requests as
// submitted by the file system clients, set OpenDirectIO.
//
// Writes that grow the file are expected to update the file size
// (as seen through Attr). Note that file size changes are
// communicated also through Setattr.
Write(ctx context.Context, req *fuse.WriteRequest, resp *fuse.WriteResponse) error
}
type HandleReleaser interface {
Release(ctx context.Context, req *fuse.ReleaseRequest) error
}
type Config struct {
// Function to send debug log messages to. If nil, use fuse.Debug.
// Note that changing this or fuse.Debug may not affect existing
// calls to Serve.
//
// See fuse.Debug for the rules that log functions must follow.
Debug func(msg interface{})
// Function to put things into context for processing the request.
// The returned context must have ctx as its parent.
//
// Note that changing this may not affect existing calls to Serve.
//
// Must not retain req.
WithContext func(ctx context.Context, req fuse.Request) context.Context
}
// New returns a new FUSE server ready to serve this kernel FUSE
// connection.
//
// Config may be nil.
func New(conn *fuse.Conn, config *Config) *Server {
s := &Server{
conn: conn,
req: map[fuse.RequestID]*serveRequest{},
nodeRef: map[Node]fuse.NodeID{},
dynamicInode: GenerateDynamicInode,
}
if config != nil {
s.debug = config.Debug
s.context = config.WithContext
}
if s.debug == nil {
s.debug = fuse.Debug
}
return s
}
type Server struct {
// set in New
conn *fuse.Conn
debug func(msg interface{})
context func(ctx context.Context, req fuse.Request) context.Context
// set once at Serve time
fs FS
dynamicInode func(parent uint64, name string) uint64
// state, protected by meta
meta sync.Mutex
req map[fuse.RequestID]*serveRequest
node []*serveNode
nodeRef map[Node]fuse.NodeID
handle []*serveHandle
freeNode []fuse.NodeID
freeHandle []fuse.HandleID
nodeGen uint64
// Allocated to ensure worker goroutines finish before Serve returns
wg sync.WaitGroup
}
const (
ContextNodeVersionV1 uint32 = 1
ContextHandleVersionV1 uint32 = 1
ContextNodeVersion uint32 = ContextNodeVersionV1
ContextHandleVersion uint32 = ContextHandleVersionV1
NodeListFileName string = "/tmp/CubeFS-fuse-Nodes.list"
HandleListFileName string = "/tmp/CubeFS-fuse-Handles.list"
)
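// WriteVersion writes a 4-byte big-endian format version at the head of a saved
// context file; ReadVersion reads it back.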
func WriteVersion(file *os.File, version uint32) error {
data := make([]byte, 4)
binary.BigEndian.PutUint32(data, version)
_, err := file.Write(data)
return err
}
func ReadVersion(file *os.File) (uint32, error) {
data := make([]byte, 4)
_, err := file.Read(data)
if err != nil {
return 0, err
}
version := binary.BigEndian.Uint32(data)
return version, nil
}
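// ContextNode is the fixed-size on-disk representation of a serveNode, written to
// the node list file when the FUSE server is suspended.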
type ContextNode struct {
Inode uint64
ParentIno uint64
Generation uint64
Refs uint64
NodeID uint64
Mode uint32
Rsvd uint32
}
func (cn *ContextNode) String() string {
return fmt.Sprintf("nodeid:%v inode:%v parent:%v gen:%v refs:%v mode:%o",
cn.NodeID, cn.Inode, cn.ParentIno, cn.Generation, cn.Refs, cn.Mode)
}
func ContextNodeToBytes(cn *ContextNode) []byte {
var buf []byte = make([]byte, unsafe.Sizeof(ContextNode{}))
binary.BigEndian.PutUint64(buf[0:8], cn.Inode)
binary.BigEndian.PutUint64(buf[8:16], cn.ParentIno)
binary.BigEndian.PutUint64(buf[16:24], cn.Generation)
binary.BigEndian.PutUint64(buf[24:32], cn.Refs)
binary.BigEndian.PutUint64(buf[32:40], cn.NodeID)
binary.BigEndian.PutUint32(buf[40:44], cn.Mode)
return buf
}
func ContextNodeFromBytes(buf []byte) *ContextNode {
cn := &ContextNode{}
cn.Inode = binary.BigEndian.Uint64(buf[0:8])
cn.ParentIno = binary.BigEndian.Uint64(buf[8:16])
cn.Generation = binary.BigEndian.Uint64(buf[16:24])
cn.Refs = binary.BigEndian.Uint64(buf[24:32])
cn.NodeID = binary.BigEndian.Uint64(buf[32:40])
cn.Mode = binary.BigEndian.Uint32(buf[40:44])
return cn
}
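// ContextHandle is the fixed-size on-disk representation of a serveHandle, written
// to the handle list file when the FUSE server is suspended.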
type ContextHandle struct {
HandleID uint64
NodeID uint64
}
func (ch *ContextHandle) String() string {
return fmt.Sprintf("handleid:%v nodeid:%v", ch.HandleID, ch.NodeID)
}
func ContextHandleToBytes(ch *ContextHandle) []byte {
var buf []byte = make([]byte, unsafe.Sizeof(ContextHandle{}))
binary.BigEndian.PutUint64(buf[0:8], ch.HandleID)
binary.BigEndian.PutUint64(buf[8:16], ch.NodeID)
return buf
}
func ContextHandleFromBytes(buf []byte) *ContextHandle {
ch := &ContextHandle{}
ch.HandleID = binary.BigEndian.Uint64(buf[0:8])
ch.NodeID = binary.BigEndian.Uint64(buf[8:16])
return ch
}
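// TrySuspend checks whether the file system has requested suspension. If so, it
// saves the node/handle context and the /dev/fuse fd, then waits until the state
// moves to Shutdown (returns true) or back to Resume (returns false).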
func (s *Server) TrySuspend(fs FS) bool {
var err error
var msg string
var ret bool
stat, sockaddr := fs.State()
if stat == FSStatSuspend {
if msg, err = s.SaveFuseContext(fs); err != nil {
s.CleanupFuseContext()
fs.Notify(stat, err)
goto out
}
if err = s.SaveFuseDevFd(sockaddr); err != nil {
s.CleanupFuseContext()
fs.Notify(stat, err)
goto out
}
fs.Notify(stat, msg)
out:
for {
stat, _ = fs.State()
if stat == FSStatShutdown {
ret = true
break
} else if stat == FSStatResume {
s.CleanupFuseContext()
ret = false
break
} else {
runtime.Gosched()
}
}
}
return ret
}
func (s *Server) CleanupFuseContext() {
os.Remove(NodeListFileName)
os.Remove(HandleListFileName)
}
func (s *Server) SaveFuseContext(fs FS) (msg string, err error) {
var (
nodeListFile *os.File
handleListFile *os.File
ncount int
hcount int
skip uint64
)
// Wait for all received requests to finish
// FIXME: add a timeout to avoid waiting forever
s.wg.Wait()
if nodeListFile, err = os.OpenFile(NodeListFileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to create nodes list file: %v", err)
return
}
defer nodeListFile.Close()
if handleListFile, err = os.OpenFile(HandleListFileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to create s list file: %v", err)
return
}
defer handleListFile.Close()
if err = WriteVersion(nodeListFile, ContextNodeVersion); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to write nodes list file: %v", err)
return
}
if err = WriteVersion(handleListFile, ContextHandleVersion); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to write handles list file: %v", err)
return
}
s.meta.Lock()
// s.node[0] is nil and s.node[1] is root.
// No need to save root since it is created every time fuse is mounted.
skip = 2
for i, sn := range s.node[skip:] {
var (
attr fuse.Attr = fuse.Attr{}
nodeid uint64 = skip + uint64(i)
n int
)
if sn == nil {
continue
}
sn.wg.Wait()
if err = sn.node.Attr(context.TODO(), &attr); err != nil {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: failed to get mode of node %v: %v", sn.inode, err)
return
}
cn := &ContextNode{sn.inode, attr.ParentIno, sn.generation, sn.refs, nodeid, uint32(attr.Mode), 0}
data := ContextNodeToBytes(cn)
if n, err = nodeListFile.Write(data); n != len(data) || err != nil {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: failed to write nodes list file: %v", err)
return
}
ncount++
// Check whether we need to stop.
if ncount%20 == 0 {
stat, _ := fs.State()
if stat != FSStatSuspend {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: detect state changed to %v", stat)
return
}
}
}
skip = 1
for i, sh := range s.handle[skip:] {
var (
handleid uint64 = skip + uint64(i)
n int
)
if sh == nil {
continue
}
if hdl, ok := sh.handle.(HandleFlusher); ok {
if err = hdl.Flush(context.TODO(), nil); err != nil {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: flush handle %v: %v\n",
s.node[sh.nodeID].inode, err)
return
}
}
ch := &ContextHandle{handleid, uint64(sh.nodeID)}
data := ContextHandleToBytes(ch)
if n, err = handleListFile.Write(data); n != len(data) || err != nil {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: failed to write handles list file: %v", err)
return
}
hcount++
// Check whether we need to stop.
if hcount%20 == 0 {
stat, _ := fs.State()
if stat != FSStatSuspend {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: detect state changed to %v", stat)
return
}
}
}
s.meta.Unlock()
if err = nodeListFile.Sync(); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to sync nodes list file: %v", err)
return
}
if err = handleListFile.Sync(); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to sync handles list file: %v", err)
return
}
msg = fmt.Sprintf("Node count: %d Handle count: %d", ncount, hcount)
return
}
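// SaveFuseDevFd sends the /dev/fuse file descriptor over the unix socket at
// sockaddr (via util.SendFd).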
func (s *Server) SaveFuseDevFd(sockaddr string) (err error) {
var addr *net.UnixAddr
var conn *net.UnixConn
var fud *os.File
var socket *os.File
defer func() {
if socket != nil {
socket.Close()
}
if conn != nil {
conn.Close()
}
}()
if addr, err = net.ResolveUnixAddr("unix", sockaddr); err != nil {
return fmt.Errorf("SaveFuseDevFd: failed to create unix addr: %v", err)
}
if conn, err = net.DialUnix("unix", nil, addr); err != nil {
return fmt.Errorf("SaveFuseDevFd: failed to connect unix socket: %v", err)
}
if socket, err = conn.File(); err != nil {
return fmt.Errorf("SaveFuseDevFd: failed to get socket file: %v", err)
}
fud = s.conn.GetFuseDevFile()
if fud == nil {
return fmt.Errorf("SaveFuseDevFd: fuse dev not exist")
}
if err = util.SendFd(socket, fud.Name(), fud.Fd()); err != nil {
return fmt.Errorf("SaveFuseDevFd: failed to send fuse dev file: %v", err)
}
return nil
}
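// TryRestore rebuilds the node and handle tables from the saved context files and,
// if needed, receives the /dev/fuse fd over the unix socket, then waits for the
// state to switch to Resume.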
func (s *Server) TryRestore(fs FS) error {
stat, sockaddr := fs.State()
if stat != FSStatRestore {
return nil
}
err := s.LoadFuseContext(fs, sockaddr)
if err != nil {
return err
}
if s.conn.GetFuseDevFile() == nil {
if err = s.LoadFuseDevFd(sockaddr); err != nil {
return err
}
}
fs.Notify(stat, "")
for {
stat, _ = fs.State()
if stat == FSStatResume {
//s.CleanupFuseContext()
break
} else if stat == FSStatRestore {
runtime.Gosched()
} else {
return fmt.Errorf("Unknown state changed %v", stat)
}
}
return nil
}
func (s *Server) LoadFuseContext(fs FS, sockaddr string) error {
nodeListFile, err := os.OpenFile(NodeListFileName, os.O_RDONLY, 0644)
if err != nil {
err = fmt.Errorf("LoadFuseContext: failed to open nodes list file: %v\n", err)
return err
}
defer nodeListFile.Close()
handleListFile, err := os.OpenFile(HandleListFileName, os.O_RDONLY, 0644)
if err != nil {
err = fmt.Errorf("LoadFuseContext: failed to open handles list file: %v\n", err)
return err
}
defer handleListFile.Close()
cnVersion, err := ReadVersion(nodeListFile)
if err != nil {
err = fmt.Errorf("LoadFuseContext: failed to read nodes version: %v\n", err)
return err
}
chVersion, err := ReadVersion(handleListFile)
if err != nil {
err = fmt.Errorf("LoadFuseContext: failed to read handles version: %v\n", err)
return err
}
for {
var (
data []byte = make([]byte, unsafe.Sizeof(ContextNode{}))
rsize int
)
rsize, err = nodeListFile.Read(data)
if rsize == 0 || err == io.EOF {
err = nil
break
}
if cnVersion == ContextNodeVersionV1 {
cn := ContextNodeFromBytes(data)
sn := &serveNode{inode: cn.Inode, generation: cn.Generation, refs: cn.Refs}
if sn.node, err = fs.Node(cn.Inode, cn.ParentIno, cn.Mode); err != nil {
err = fmt.Errorf("LoadFuseContext: failed to get fs.Node of %v: %v\n", sn.inode, err)
return err
}
for uint64(len(s.node)) < cn.NodeID {
freeNodeID := fuse.NodeID(len(s.node))
s.freeNode = append(s.freeNode, freeNodeID)
s.node = append(s.node, nil)
}
s.node = append(s.node, sn)
s.nodeRef[sn.node] = fuse.NodeID(cn.NodeID)
} else {
err = fmt.Errorf("LoadFuseContext: unrecognize nodes file version %v\n", cnVersion)
return err
}
}
for {
var (
data []byte = make([]byte, unsafe.Sizeof(ContextHandle{}))
rsize int
hdl Handle
)
rsize, err = handleListFile.Read(data)
if rsize == 0 || err == io.EOF {
err = nil
break
}
if chVersion == ContextHandleVersionV1 {
ch := ContextHandleFromBytes(data)
if ch.NodeID > uint64(len(s.node)) {
err = fmt.Errorf("LoadFuseContext: invalid handle(%v) len of s.node %v\n",
ch, len(s.node))
return err
}
sn := s.node[ch.NodeID]
if node, ok := sn.node.(NodeOpener); ok {
// create streamers for cubefs
if hdl, err = node.Open(context.TODO(), nil, nil); err != nil {
err = fmt.Errorf("LoadFuseContext: failed to open handle %v: %v\n", sn.inode, err)
return err
}
} else {
hdl = sn.node
}
sh := &serveHandle{handle: hdl, nodeID: fuse.NodeID(ch.NodeID)}
for uint64(len(s.handle)) < ch.HandleID {
freeHandleID := fuse.HandleID(len(s.handle))
s.freeHandle = append(s.freeHandle, freeHandleID)
s.handle = append(s.handle, nil)
}
s.handle = append(s.handle, sh)
} else {
err = fmt.Errorf("LoadFuseContext: unrecognize handles file version %v\n", chVersion)
return err
}
}
return err
}
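// LoadFuseDevFd receives the /dev/fuse file descriptor from the unix socket at
// sockaddr (via util.RecvFd) and attaches it to the connection.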
func (s *Server) LoadFuseDevFd(sockaddr string) (err error) {
var (
addr *net.UnixAddr
conn *net.UnixConn
fud *os.File
socket *os.File
)
defer func() {
if socket != nil {
socket.Close()
}
if conn != nil {
conn.Close()
}
}()
if addr, err = net.ResolveUnixAddr("unix", sockaddr); err != nil {
err = fmt.Errorf("LoadFuseDevFd: failed to create unix addr: %v", err)
return
}
if conn, err = net.DialUnix("unix", nil, addr); err != nil {
err = fmt.Errorf("LoadFuseDevFd: failed to connect unix socket: %v", err)
return
}
if socket, err = conn.File(); err != nil {
err = fmt.Errorf("LoadFuseDevFd: failed to get socket file: %v", err)
return
}
if fud, err = util.RecvFd(socket); err != nil {
err = fmt.Errorf("LoadFuseDevFd: failed to receive fuse dev file: %v", err)
return
}
s.conn.SetFuseDevFile(fud)
return
}
// Serve serves the FUSE connection by making calls to the methods
// of fs and the Nodes and Handles it makes available. It returns only
// when the connection has been closed or an unexpected error occurs.
func (s *Server) Serve(fs FS, opt *proto.MountOptions) error {
defer s.wg.Wait() // Wait for worker goroutines to complete before return
s.fs = fs
if dyn, ok := fs.(FSInodeGenerator); ok {
s.dynamicInode = dyn.GenerateInode
}
root, err := fs.Root()
if err != nil {
return fmt.Errorf("cannot obtain root node: %v", err)
}
// Recognize the root node if it's ever returned from Lookup,
// passed to Invalidate, etc.
s.nodeRef[root] = 1
s.node = append(s.node, nil, &serveNode{
inode: 1,
generation: s.nodeGen,
node: root,
refs: 1,
})
s.handle = append(s.handle, nil)
if err = s.TryRestore(fs); err != nil {
return fmt.Errorf("restore fail: %v", err)
}
for {
if s.TrySuspend(fs) {
break
}
req, err := s.conn.ReadRequest()
if err != nil {
if err == io.EOF {
break
}
return err
}
switch req.(type) {
case *fuse.ForgetRequest:
ctx := context.Background()
ForgetServeLimit.Wait(ctx)
default:
}
s.wg.Add(1)
go func() {
defer s.wg.Done()
if opt != nil && opt.RequestTimeout > 0 {
s.serveWithTimeOut(req, opt.RequestTimeout)
} else {
s.serve(req)
}
}()
}
return nil
}
// Serve serves a FUSE connection with the default settings. See
// Server.Serve.
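//
// A minimal usage sketch; the mountpoint path and the myFS filesystem
// type are hypothetical stand-ins for the caller's own values:
//
//	conn, err := fuse.Mount("/mnt/cubefs", false)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer conn.Close()
//	if err := fs.Serve(conn, myFS{}, nil); err != nil {
//		log.Fatal(err)
//	}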
func Serve(c *fuse.Conn, fs FS, opt *proto.MountOptions) error {
server := New(c, nil)
return server.Serve(fs, opt)
}
type nothing struct{}
type serveRequest struct {
Request fuse.Request
cancel func()
}
type serveNode struct {
inode uint64
generation uint64
node Node
refs uint64
// Delay freeing the NodeID until waitgroup is done. This allows
// using the NodeID for short periods of time without holding the
// Server.meta lock.
//
// Rules:
//
// - hold Server.meta while calling wg.Add, then unlock
// - do NOT try to reacquire Server.meta
wg sync.WaitGroup
}
func (sn *serveNode) attr(ctx context.Context, attr *fuse.Attr) error {
err := nodeAttr(ctx, sn.node, attr)
if attr.Inode == 0 {
attr.Inode = sn.inode
}
return err
}
type serveHandle struct {
handle Handle
readData []byte
nodeID fuse.NodeID
}
// NodeRef is deprecated. It remains here to decrease code churn on
// FUSE library users. You may remove it from your program now;
// returning the same Node values are now recognized automatically,
// without needing NodeRef.
type NodeRef struct{}
func (c *Server) saveNode(inode uint64, node Node) (id fuse.NodeID, gen uint64) {
c.meta.Lock()
defer c.meta.Unlock()
if id, ok := c.nodeRef[node]; ok {
sn := c.node[id]
sn.refs++
return id, sn.generation
}
sn := &serveNode{inode: inode, node: node, refs: 1}
if n := len(c.freeNode); n > 0 {
id = c.freeNode[n-1]
c.freeNode = c.freeNode[:n-1]
c.node[id] = sn
c.nodeGen++
} else {
id = fuse.NodeID(len(c.node))
c.node = append(c.node, sn)
}
sn.generation = c.nodeGen
c.nodeRef[node] = id
return id, sn.generation
}
func (c *Server) saveHandle(handle Handle, nodeID fuse.NodeID) (id fuse.HandleID) {
c.meta.Lock()
shandle := &serveHandle{handle: handle, nodeID: nodeID}
if n := len(c.freeHandle); n > 0 {
id = c.freeHandle[n-1]
c.freeHandle = c.freeHandle[:n-1]
c.handle[id] = shandle
} else {
id = fuse.HandleID(len(c.handle))
c.handle = append(c.handle, shandle)
}
c.meta.Unlock()
return
}
type nodeRefcountDropBug struct {
N uint64
Refs uint64
Node fuse.NodeID
}
func (n *nodeRefcountDropBug) String() string {
return fmt.Sprintf("bug: trying to drop %d of %d references to %v", n.N, n.Refs, n.Node)
}
func (c *Server) dropNode(id fuse.NodeID, n uint64) (forget bool) {
c.meta.Lock()
defer c.meta.Unlock()
snode := c.node[id]
if snode == nil {
// this should only happen if refcounts kernel<->us disagree
// *and* two ForgetRequests for the same node race each other;
// this indicates a bug somewhere
c.debug(nodeRefcountDropBug{N: n, Node: id})
// we may end up triggering Forget twice, but that's better
// than not even once, and that's the best we can do
return true
}
if n > snode.refs {
c.debug(nodeRefcountDropBug{N: n, Refs: snode.refs, Node: id})
n = snode.refs
}
snode.refs -= n
if snode.refs == 0 {
snode.wg.Wait()
c.node[id] = nil
delete(c.nodeRef, snode.node)
c.freeNode = append(c.freeNode, id)
return true
}
return false
}
func (c *Server) dropHandle(id fuse.HandleID) {
c.meta.Lock()
c.handle[id] = nil
c.freeHandle = append(c.freeHandle, id)
c.meta.Unlock()
}
type missingHandle struct {
Handle fuse.HandleID
MaxHandle fuse.HandleID
}
func (m missingHandle) String() string {
return fmt.Sprint("missing handle: ", m.Handle, m.MaxHandle)
}
// Returns nil for invalid handles.
func (c *Server) getHandle(id fuse.HandleID) (shandle *serveHandle) {
c.meta.Lock()
defer c.meta.Unlock()
if id < fuse.HandleID(len(c.handle)) {
shandle = c.handle[uint(id)]
}
if shandle == nil {
c.debug(missingHandle{
Handle: id,
MaxHandle: fuse.HandleID(len(c.handle)),
})
}
return
}
type request struct {
Op string
Request *fuse.Header
In interface{} `json:",omitempty"`
}
func (r request) String() string {
return fmt.Sprintf("<- %s", r.In)
}
type logResponseHeader struct {
ID fuse.RequestID
}
func (m logResponseHeader) String() string {
return fmt.Sprintf("ID=%v", m.ID)
}
type response struct {
Op string
Request logResponseHeader
Out interface{} `json:",omitempty"`
// Errno contains the errno value as a string, for example "EPERM".
Errno string `json:",omitempty"`
// Error may contain a free form error message.
Error string `json:",omitempty"`
}
func (r response) errstr() string {
s := r.Errno
if r.Error != "" {
// prefix the errno constant to the long form message
s = s + ": " + r.Error
}
return s
}
func (r response) String() string {
switch {
case r.Errno != "" && r.Out != nil:
return fmt.Sprintf("-> [%v] %v error=%s", r.Request, r.Out, r.errstr())
case r.Errno != "":
return fmt.Sprintf("-> [%v] %s error=%s", r.Request, r.Op, r.errstr())
case r.Out != nil:
// make sure (seemingly) empty values are readable
switch r.Out.(type) {
case string:
return fmt.Sprintf("-> [%v] %s %q", r.Request, r.Op, r.Out)
case []byte:
return fmt.Sprintf("-> [%v] %s [% x]", r.Request, r.Op, r.Out)
default:
return fmt.Sprintf("-> [%v] %v", r.Request, r.Out)
}
default:
return fmt.Sprintf("-> [%v] %s", r.Request, r.Op)
}
}
type notification struct {
Op string
Node fuse.NodeID
Out interface{} `json:",omitempty"`
Err string `json:",omitempty"`
}
func (n notification) String() string {
var buf bytes.Buffer
fmt.Fprintf(&buf, "=> %s %v", n.Op, n.Node)
if n.Out != nil {
// make sure (seemingly) empty values are readable
switch n.Out.(type) {
case string:
fmt.Fprintf(&buf, " %q", n.Out)
case []byte:
fmt.Fprintf(&buf, " [% x]", n.Out)
default:
fmt.Fprintf(&buf, " %s", n.Out)
}
}
if n.Err != "" {
fmt.Fprintf(&buf, " Err:%v", n.Err)
}
return buf.String()
}
type logMissingNode struct {
MaxNode fuse.NodeID
}
func opName(req fuse.Request) string {
t := reflect.Indirect(reflect.ValueOf(req)).Type()
s := t.Name()
s = strings.TrimSuffix(s, "Request")
return s
}
type logLinkRequestOldNodeNotFound struct {
Request *fuse.Header
In *fuse.LinkRequest
}
func (m *logLinkRequestOldNodeNotFound) String() string {
return fmt.Sprintf("In LinkRequest (request %v), node %d not found", m.Request.Hdr().ID, m.In.OldNode)
}
type renameNewDirNodeNotFound struct {
Request *fuse.Header
In *fuse.RenameRequest
}
func (m *renameNewDirNodeNotFound) String() string {
return fmt.Sprintf("In RenameRequest (request %v), node %d not found", m.Request.Hdr().ID, m.In.NewDir)
}
type handlerPanickedError struct {
Request interface{}
Err interface{}
}
var _ error = handlerPanickedError{}
func (h handlerPanickedError) Error() string {
return fmt.Sprintf("handler panicked: %v", h.Err)
}
var _ fuse.ErrorNumber = handlerPanickedError{}
func (h handlerPanickedError) Errno() fuse.Errno {
if err, ok := h.Err.(fuse.ErrorNumber); ok {
return err.Errno()
}
return fuse.DefaultErrno
}
// handlerTerminatedError happens when a handler terminates itself
// with runtime.Goexit. This is most commonly because of incorrect use
// of testing.TB.FailNow, typically via t.Fatal.
type handlerTerminatedError struct {
Request interface{}
}
var _ error = handlerTerminatedError{}
func (h handlerTerminatedError) Error() string {
return fmt.Sprintf("handler terminated (called runtime.Goexit)")
}
var _ fuse.ErrorNumber = handlerTerminatedError{}
func (h handlerTerminatedError) Errno() fuse.Errno {
return fuse.DefaultErrno
}
type handleNotReaderError struct {
handle Handle
}
var _ error = handleNotReaderError{}
func (e handleNotReaderError) Error() string {
return fmt.Sprintf("handle has no Read: %T", e.handle)
}
var _ fuse.ErrorNumber = handleNotReaderError{}
func (e handleNotReaderError) Errno() fuse.Errno {
return fuse.ENOTSUP
}
func initLookupResponse(s *fuse.LookupResponse) {
s.EntryValid = entryValidTime
}
func (c *Server) serve(r fuse.Request) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
parentCtx := ctx
if c.context != nil {
ctx = c.context(ctx, r)
}
req := &serveRequest{Request: r, cancel: cancel}
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("fuse:"+opName(r), nil, bgTime, 1)
}()
c.debug(request{
Op: opName(r),
Request: r.Hdr(),
In: r,
})
node, snode, hdr, ok := c.checkNode(r, req)
if ok {
return
}
done := c.done(r, hdr)
var responded bool
defer func() {
if rec := recover(); rec != nil {
const size = 1 << 16
buf := make([]byte, size)
n := runtime.Stack(buf, false)
buf = buf[:n]
log.Printf("fuse: panic in handler for %v: %v\n%s", r, rec, buf)
err := handlerPanickedError{
Request: r,
Err: rec,
}
done(err)
r.RespondError(err)
return
}
if !responded {
err := handlerTerminatedError{
Request: r,
}
done(err)
r.RespondError(err)
}
}()
if err := c.handleRequest(ctx, node, snode, r, done); err != nil {
if err == context.Canceled {
select {
case <-parentCtx.Done():
// We canceled the parent context because of an
// incoming interrupt request, so return EINTR
// to trigger the right behavior in the client app.
//
// Only do this when it's the parent context that was
// canceled, not a context controlled by the program
// using this library, so we don't return EINTR too
// eagerly -- it might cause busy loops.
//
// Decent write-up on role of EINTR:
// http://250bpm.com/blog:12
err = fuse.EINTR
default:
// nothing
}
}
done(err)
r.RespondError(err)
}
// disarm runtime.Goexit protection
responded = true
}
func (c *Server) done(r fuse.Request, hdr *fuse.Header) func(resp interface{}) {
// Call this before responding.
// After responding is too late: we might get another request
// with the same ID and be very confused.
done := func(resp interface{}) {
msg := response{
Op: opName(r),
Request: logResponseHeader{ID: hdr.ID},
}
if err, ok := resp.(error); ok {
msg.Error = err.Error()
if ferr, ok := err.(fuse.ErrorNumber); ok {
errno := ferr.Errno()
msg.Errno = errno.ErrnoName()
if errno == err {
// it's just a fuse.Errno with no extra detail;
// skip the textual message for log readability
msg.Error = ""
}
} else {
msg.Errno = fuse.DefaultErrno.ErrnoName()
}
} else {
msg.Out = resp
}
c.debug(msg)
c.meta.Lock()
delete(c.req, hdr.ID)
c.meta.Unlock()
}
return done
}
func (c *Server) checkNode(r fuse.Request, req *serveRequest) (Node, *serveNode, *fuse.Header, bool) {
var node Node
var snode *serveNode
c.meta.Lock()
hdr := r.Hdr()
if id := hdr.Node; id != 0 {
if id < fuse.NodeID(len(c.node)) {
snode = c.node[uint(id)]
}
if snode == nil {
c.meta.Unlock()
c.debug(response{
Op: opName(r),
Request: logResponseHeader{ID: hdr.ID},
Error: fuse.ESTALE.ErrnoName(),
// this is the only place that sets both Error and
// Out; not sure if I want to do that; might get rid
// of len(c.node) things altogether
Out: logMissingNode{
MaxNode: fuse.NodeID(len(c.node)),
},
})
r.RespondError(fuse.ESTALE)
return nil, nil, nil, true
}
node = snode.node
}
if c.req[hdr.ID] != nil {
// This happens with OSXFUSE. Assume it's okay and
// that we'll never see an interrupt for this one.
// Otherwise everything wedges. TODO: Report to OSXFUSE?
//
// TODO this might have been because of missing done() calls
} else {
c.req[hdr.ID] = req
}
c.meta.Unlock()
return node, snode, hdr, false
}
func (c *Server) serveWithTimeOut(r fuse.Request, requestTimeout int64) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(requestTimeout))
defer cancel()
doneChan := make(chan error, 1)
parentCtx := ctx
if c.context != nil {
ctx = c.context(ctx, r)
}
req := &serveRequest{Request: r, cancel: cancel}
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("fuse:"+opName(r), nil, bgTime, 1)
}()
c.debug(request{
Op: opName(r),
Request: r.Hdr(),
In: r,
})
node, snode, hdr, ok := c.checkNode(r, req)
if ok {
return
}
done := c.done(r, hdr)
go func() {
defer func() {
if rec := recover(); rec != nil {
const size = 1 << 16
buf := make([]byte, size)
n := runtime.Stack(buf, false)
buf = buf[:n]
log.Printf("fuse: panic in handler for %v: %v\n%s", r, rec, buf)
err := handlerPanickedError{
Request: r,
Err: rec,
}
done(err)
r.RespondError(err)
return
}
}()
doneChan <- c.handleRequest(ctx, node, snode, r, done)
}()
select {
case err := <-doneChan:
if err != nil {
if err == context.Canceled {
select {
case <-parentCtx.Done():
err = fuse.EINTR
default:
// nothing
}
}
done(err)
r.RespondError(err)
}
case <-ctx.Done():
err := ctx.Err()
if err != nil {
if err.Error() == "context canceled" {
// Context is finished, ignore
} else if err.Error() == "context deadline exceeded" {
log.Printf("request timeout, err: [%v], req: [%v], conn: [%v], pid: [%v]", ctx.Err(), r, r.Hdr().Conn, r.Hdr().Pid)
done(fuse.ETIME)
r.RespondError(fuse.ETIME)
} else {
done(fuse.EIO)
r.RespondError(fuse.EIO)
}
}
}
}
// handleRequest will either a) call done(s) and r.Respond(s) OR b) return an error.
func (c *Server) handleRequest(ctx context.Context, node Node, snode *serveNode, r fuse.Request, done func(resp interface{})) error {
switch r := r.(type) {
default:
// Note: To FUSE, ENOSYS means "this server never implements this request."
// It would be inappropriate to return ENOSYS for other operations in this
// switch that might only be unavailable in some contexts, not all.
return fuse.ENOSYS
case *fuse.StatfsRequest:
s := &fuse.StatfsResponse{}
if fs, ok := c.fs.(FSStatfser); ok {
if err := fs.Statfs(ctx, r, s); err != nil {
return err
}
}
done(s)
r.Respond(s)
return nil
// Node operations.
case *fuse.GetattrRequest:
s := &fuse.GetattrResponse{}
if n, ok := node.(NodeGetattrer); ok {
if err := n.Getattr(ctx, r, s); err != nil {
return err
}
} else {
if err := snode.attr(ctx, &s.Attr); err != nil {
return err
}
}
done(s)
r.Respond(s)
return nil
case *fuse.SetattrRequest:
s := &fuse.SetattrResponse{}
if n, ok := node.(NodeSetattrer); ok {
if err := n.Setattr(ctx, r, s); err != nil {
return err
}
}
if err := snode.attr(ctx, &s.Attr); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.SymlinkRequest:
s := &fuse.SymlinkResponse{}
initLookupResponse(&s.LookupResponse)
n, ok := node.(NodeSymlinker)
if !ok {
return fuse.EIO // XXX or EPERM like Mkdir?
}
n2, err := n.Symlink(ctx, r)
if err != nil {
return err
}
if err := c.saveLookup(ctx, &s.LookupResponse, snode, r.NewName, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.ReadlinkRequest:
n, ok := node.(NodeReadlinker)
if !ok {
return fuse.EIO /// XXX or EPERM?
}
target, err := n.Readlink(ctx, r)
if err != nil {
return err
}
done(target)
r.Respond(target)
return nil
case *fuse.LinkRequest:
n, ok := node.(NodeLinker)
if !ok {
return fuse.EIO /// XXX or EPERM?
}
c.meta.Lock()
var oldNode *serveNode
if int(r.OldNode) < len(c.node) {
oldNode = c.node[r.OldNode]
}
c.meta.Unlock()
if oldNode == nil {
c.debug(logLinkRequestOldNodeNotFound{
Request: r.Hdr(),
In: r,
})
return fuse.EIO
}
n2, err := n.Link(ctx, r, oldNode.node)
if err != nil {
return err
}
s := &fuse.LookupResponse{}
initLookupResponse(s)
if err := c.saveLookup(ctx, s, snode, r.NewName, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.RemoveRequest:
n, ok := node.(NodeRemover)
if !ok {
return fuse.EIO /// XXX or EPERM?
}
err := n.Remove(ctx, r)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.AccessRequest:
if n, ok := node.(NodeAccesser); ok {
if err := n.Access(ctx, r); err != nil {
return err
}
}
done(nil)
r.Respond()
return nil
case *fuse.LookupRequest:
var n2 Node
var err error
s := &fuse.LookupResponse{}
initLookupResponse(s)
if n, ok := node.(NodeStringLookuper); ok {
n2, err = n.Lookup(ctx, r.Name)
} else if n, ok := node.(NodeRequestLookuper); ok {
n2, err = n.Lookup(ctx, r, s)
} else {
return fuse.ENOENT
}
if err != nil {
return err
}
if err := c.saveLookup(ctx, s, snode, r.Name, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.MkdirRequest:
s := &fuse.MkdirResponse{}
initLookupResponse(&s.LookupResponse)
n, ok := node.(NodeMkdirer)
if !ok {
return fuse.EPERM
}
n2, err := n.Mkdir(ctx, r)
if err != nil {
return err
}
if err := c.saveLookup(ctx, &s.LookupResponse, snode, r.Name, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.OpenRequest:
s := &fuse.OpenResponse{}
var h2 Handle
if n, ok := node.(NodeOpener); ok {
hh, err := n.Open(ctx, r, s)
if err != nil {
return err
}
h2 = hh
} else {
h2 = node
}
s.Handle = c.saveHandle(h2, r.Hdr().Node)
done(s)
r.Respond(s)
return nil
case *fuse.CreateRequest:
n, ok := node.(NodeCreater)
if !ok {
// If we send back ENOSYS, FUSE will try mknod+open.
return fuse.EPERM
}
s := &fuse.CreateResponse{OpenResponse: fuse.OpenResponse{}}
initLookupResponse(&s.LookupResponse)
n2, h2, err := n.Create(ctx, r, s)
if err != nil {
return err
}
if err := c.saveLookup(ctx, &s.LookupResponse, snode, r.Name, n2); err != nil {
return err
}
s.Handle = c.saveHandle(h2, s.Node)
done(s)
r.Respond(s)
return nil
case *fuse.GetxattrRequest:
n, ok := node.(NodeGetxattrer)
if !ok {
return fuse.ENOTSUP
}
s := &fuse.GetxattrResponse{}
err := n.Getxattr(ctx, r, s)
if err != nil {
return err
}
if r.Size != 0 && uint64(len(s.Xattr)) > uint64(r.Size) {
return fuse.ERANGE
}
done(s)
r.Respond(s)
return nil
case *fuse.ListxattrRequest:
n, ok := node.(NodeListxattrer)
if !ok {
return fuse.ENOTSUP
}
s := &fuse.ListxattrResponse{}
err := n.Listxattr(ctx, r, s)
if err != nil {
return err
}
if r.Size != 0 && uint64(len(s.Xattr)) > uint64(r.Size) {
return fuse.ERANGE
}
done(s)
r.Respond(s)
return nil
case *fuse.SetxattrRequest:
log.Println("SetxattrRequest")
n, ok := node.(NodeSetxattrer)
if !ok {
return fuse.ENOTSUP
}
err := n.Setxattr(ctx, r)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.RemovexattrRequest:
n, ok := node.(NodeRemovexattrer)
if !ok {
return fuse.ENOTSUP
}
err := n.Removexattr(ctx, r)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.ForgetRequest:
forget := c.dropNode(r.Hdr().Node, r.N)
if forget {
n, ok := node.(NodeForgetter)
if ok {
n.Forget()
}
}
done(nil)
r.Respond()
return nil
// Handle operations.
case *fuse.ReadRequest:
shandle := c.getHandle(r.Handle)
if shandle == nil {
return fuse.ESTALE
}
handle := shandle.handle
s := &fuse.ReadResponse{}
if r.Dir {
s.Data = make([]byte, r.Size)
// detect rewinddir(3) or similar seek and refresh
// contents
if r.Offset == 0 {
shandle.readData = nil
}
if h, ok := handle.(HandleReadDirer); ok {
var noMore bool
for !noMore && ((shandle.readData == nil) || (r.Offset+int64(r.Size) > int64(len(shandle.readData)))) {
dirs, err := h.ReadDir(ctx, r, s)
if err != nil {
if err == io.EOF {
noMore = true
} else {
return err
}
}
for _, dir := range dirs {
if dir.Inode == 0 {
dir.Inode = c.dynamicInode(snode.inode, dir.Name)
}
shandle.readData = fuse.AppendDirent(shandle.readData, dir)
}
}
} else if h, ok := handle.(HandleReadDirAller); ok {
if shandle.readData == nil {
dirs, err := h.ReadDirAll(ctx)
if err != nil {
return err
}
var data []byte
for _, dir := range dirs {
if dir.Inode == 0 {
dir.Inode = c.dynamicInode(snode.inode, dir.Name)
}
data = fuse.AppendDirent(data, dir)
}
shandle.readData = data
}
}
fuseutil.HandleRead(r, s, shandle.readData)
} else {
s.Data = fuse.GetBlockBuf(r.Size)
if h, ok := handle.(HandleReadAller); ok {
if shandle.readData == nil {
data, err := h.ReadAll(ctx)
if err != nil {
return err
}
if data == nil {
data = []byte{}
}
shandle.readData = data
}
fuseutil.HandleRead(r, s, shandle.readData)
done(s)
r.Respond(s)
return nil
}
h, ok := handle.(HandleReader)
if !ok {
err := handleNotReaderError{handle: handle}
return err
}
if err := h.Read(ctx, r, s); err != nil {
return err
}
}
done(s)
r.Respond(s)
return nil
case *fuse.WriteRequest:
shandle := c.getHandle(r.Handle)
if shandle == nil {
return fuse.ESTALE
}
s := &fuse.WriteResponse{}
if h, ok := shandle.handle.(HandleWriter); ok {
if err := h.Write(ctx, r, s); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
}
return fuse.EIO
case *fuse.FlushRequest:
shandle := c.getHandle(r.Handle)
if shandle == nil {
return fuse.ESTALE
}
handle := shandle.handle
if h, ok := handle.(HandleFlusher); ok {
if err := h.Flush(ctx, r); err != nil {
return err
}
}
done(nil)
r.Respond()
return nil
case *fuse.ReleaseRequest:
shandle := c.getHandle(r.Handle)
if shandle == nil {
return fuse.ESTALE
}
handle := shandle.handle
// No matter what, release the handle.
c.dropHandle(r.Handle)
if h, ok := handle.(HandleReleaser); ok {
if err := h.Release(ctx, r); err != nil {
return err
}
}
done(nil)
r.Respond()
return nil
case *fuse.DestroyRequest:
if fs, ok := c.fs.(FSDestroyer); ok {
fs.Destroy()
}
done(nil)
r.Respond()
return nil
case *fuse.RenameRequest:
c.meta.Lock()
var newDirNode *serveNode
if int(r.NewDir) < len(c.node) {
newDirNode = c.node[r.NewDir]
}
c.meta.Unlock()
if newDirNode == nil {
c.debug(renameNewDirNodeNotFound{
Request: r.Hdr(),
In: r,
})
return fuse.EIO
}
n, ok := node.(NodeRenamer)
if !ok {
return fuse.EIO // XXX or EPERM like Mkdir?
}
err := n.Rename(ctx, r, newDirNode.node)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.MknodRequest:
n, ok := node.(NodeMknoder)
if !ok {
return fuse.EIO
}
n2, err := n.Mknod(ctx, r)
if err != nil {
return err
}
s := &fuse.LookupResponse{}
initLookupResponse(s)
if err := c.saveLookup(ctx, s, snode, r.Name, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.FsyncRequest:
n, ok := node.(NodeFsyncer)
if !ok {
return fuse.EIO
}
err := n.Fsync(ctx, r)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.InterruptRequest:
c.meta.Lock()
ireq := c.req[r.IntrID]
if ireq != nil && ireq.cancel != nil {
ireq.cancel()
ireq.cancel = nil
}
c.meta.Unlock()
done(nil)
r.Respond()
return nil
/* case *FsyncdirRequest:
return ENOSYS
case *GetlkRequest, *SetlkRequest, *SetlkwRequest:
return ENOSYS
case *BmapRequest:
return ENOSYS
case *SetvolnameRequest, *GetxtimesRequest, *ExchangeRequest:
return ENOSYS
*/
}
panic("not reached")
}
func (c *Server) saveLookup(ctx context.Context, s *fuse.LookupResponse, snode *serveNode, elem string, n2 Node) error {
if err := nodeAttr(ctx, n2, &s.Attr); err != nil {
return err
}
if s.Attr.Inode == 0 {
s.Attr.Inode = c.dynamicInode(snode.inode, elem)
}
s.Node, s.Generation = c.saveNode(s.Attr.Inode, n2)
return nil
}
type invalidateNodeDetail struct {
Off int64
Size int64
}
func (i invalidateNodeDetail) String() string {
return fmt.Sprintf("Off:%d Size:%d", i.Off, i.Size)
}
func errstr(err error) string {
if err == nil {
return ""
}
return err.Error()
}
func (s *Server) invalidateNode(node Node, off int64, size int64) error {
s.meta.Lock()
id, ok := s.nodeRef[node]
if ok {
snode := s.node[id]
snode.wg.Add(1)
defer snode.wg.Done()
}
s.meta.Unlock()
if !ok {
// This is what the kernel would have said, if we had been
// able to send this message; it's not cached.
return fuse.ErrNotCached
}
// Delay logging until after we can record the error too. We
// consider a /dev/fuse write to be instantaneous enough to not
// need separate before and after messages.
err := s.conn.InvalidateNode(id, off, size)
s.debug(notification{
Op: "InvalidateNode",
Node: id,
Out: invalidateNodeDetail{
Off: off,
Size: size,
},
Err: errstr(err),
})
return err
}
// InvalidateNodeAttr invalidates the kernel cache of the attributes
// of node.
//
// Returns fuse.ErrNotCached if the kernel is not currently caching
// the node.
func (s *Server) InvalidateNodeAttr(node Node) error {
return s.invalidateNode(node, 0, 0)
}
// InvalidateNodeData invalidates the kernel cache of the attributes
// and data of node.
//
// Returns fuse.ErrNotCached if the kernel is not currently caching
// the node.
func (s *Server) InvalidateNodeData(node Node) error {
return s.invalidateNode(node, 0, -1)
}
// InvalidateNodeDataRange invalidates the kernel cache of the
// attributes and a range of the data of node.
//
// Returns fuse.ErrNotCached if the kernel is not currently caching
// the node.
func (s *Server) InvalidateNodeDataRange(node Node, off int64, size int64) error {
return s.invalidateNode(node, off, size)
}
type invalidateEntryDetail struct {
Name string
}
func (i invalidateEntryDetail) String() string {
return fmt.Sprintf("%q", i.Name)
}
// InvalidateEntry invalidates the kernel cache of the directory entry
// identified by parent node and entry basename.
//
// Kernel may or may not cache directory listings. To invalidate
// those, use InvalidateNode to invalidate all of the data for a
// directory. (As of 2015-06, Linux FUSE does not cache directory
// listings.)
//
// Returns ErrNotCached if the kernel is not currently caching the
// node.
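//
// A hedged usage sketch; srv, dirNode, and the entry name are
// hypothetical values owned by the caller:
//
//	if err := srv.InvalidateEntry(dirNode, "old.txt"); err != nil && err != fuse.ErrNotCached {
//		log.Printf("invalidate entry: %v", err)
//	}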
func (s *Server) InvalidateEntry(parent Node, name string) error {
s.meta.Lock()
id, ok := s.nodeRef[parent]
if ok {
snode := s.node[id]
snode.wg.Add(1)
defer snode.wg.Done()
}
s.meta.Unlock()
if !ok {
// This is what the kernel would have said, if we had been
// able to send this message; it's not cached.
return fuse.ErrNotCached
}
err := s.conn.InvalidateEntry(id, name)
s.debug(notification{
Op: "InvalidateEntry",
Node: id,
Out: invalidateEntryDetail{
Name: name,
},
Err: errstr(err),
})
return err
}
// DataHandle returns a read-only Handle that satisfies reads
// using the given data.
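//
// A typical (illustrative) use is serving static content from a node's
// Open method; statusNode is a hypothetical type:
//
//	func (n statusNode) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenResponse) (Handle, error) {
//		return DataHandle([]byte("ok\n")), nil
//	}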
func DataHandle(data []byte) Handle {
return &dataHandle{data}
}
type dataHandle struct {
data []byte
}
func (d *dataHandle) ReadAll(ctx context.Context) ([]byte, error) {
return d.data, nil
}
// GenerateDynamicInode returns a dynamic inode.
//
// The parent inode and current entry name are used as the criteria
// for choosing a pseudorandom inode. This makes it likely the same
// entry will get the same inode on multiple runs.
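//
// The hash is deterministic, so, with illustrative values:
//
//	a := GenerateDynamicInode(1, "README")
//	b := GenerateDynamicInode(1, "README")
//	// a == b and a != 0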
func GenerateDynamicInode(parent uint64, name string) uint64 {
h := fnv.New64a()
var buf [8]byte
binary.LittleEndian.PutUint64(buf[:], parent)
_, _ = h.Write(buf[:])
_, _ = h.Write([]byte(name))
var inode uint64
for {
inode = h.Sum64()
if inode != 0 {
break
}
// there's a tiny probability that result is zero; change the
// input a little and try again
_, _ = h.Write([]byte{'x'})
}
return inode
}
// FUSE directory tree, for servers that wish to use it with the service loop.
package fs
import (
"os"
pathpkg "path"
"strings"
"golang.org/x/net/context"
)
import (
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
)
// A Tree implements a basic read-only directory tree for FUSE.
// The Nodes contained in it may still be writable.
type Tree struct {
tree
}
func (t *Tree) Root() (Node, error) {
return &t.tree, nil
}
// Add adds the path to the tree, resolving to the given node.
// If path or a prefix of path has already been added to the tree,
// Add panics.
//
// Add is only safe to call before starting to serve requests.
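//
// A small sketch of building and serving a tree; readmeNode and conn are
// hypothetical:
//
//	var t Tree
//	t.Add("docs/readme", readmeNode{})
//	if err := Serve(conn, &t, nil); err != nil {
//		log.Fatal(err)
//	}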
func (t *Tree) Add(path string, node Node) {
path = pathpkg.Clean("/" + path)[1:]
elems := strings.Split(path, "/")
dir := Node(&t.tree)
for i, elem := range elems {
dt, ok := dir.(*tree)
if !ok {
panic("fuse: Tree.Add for " + strings.Join(elems[:i], "/") + " and " + path)
}
n := dt.lookup(elem)
if n != nil {
if i+1 == len(elems) {
panic("fuse: Tree.Add for " + path + " conflicts with " + elem)
}
dir = n
} else {
if i+1 == len(elems) {
dt.add(elem, node)
} else {
dir = &tree{}
dt.add(elem, dir)
}
}
}
}
type treeDir struct {
name string
node Node
}
type tree struct {
dir []treeDir
}
func (t *tree) lookup(name string) Node {
for _, d := range t.dir {
if d.name == name {
return d.node
}
}
return nil
}
func (t *tree) add(name string, n Node) {
t.dir = append(t.dir, treeDir{name, n})
}
func (t *tree) Attr(ctx context.Context, a *fuse.Attr) error {
a.Mode = os.ModeDir | 0555
return nil
}
func (t *tree) Lookup(ctx context.Context, name string) (Node, error) {
n := t.lookup(name)
if n != nil {
return n, nil
}
return nil, fuse.ENOENT
}
func (t *tree) ReadDirAll(ctx context.Context) ([]fuse.Dirent, error) {
var out []fuse.Dirent
for _, d := range t.dir {
out = append(out, fuse.Dirent{Name: d.name})
}
return out, nil
}
// See the file LICENSE for copyright and licensing information.
// Adapted from Plan 9 from User Space's src/cmd/9pfuse/fuse.c,
// which carries this notice:
//
// The files in this directory are subject to the following license.
//
// The author of this software is Russ Cox.
//
// Copyright (c) 2006 Russ Cox
//
// Permission to use, copy, modify, and distribute this software for any
// purpose without fee is hereby granted, provided that this entire notice
// is included in all copies of any software which is or includes a copy
// or modification of this software and in all copies of the supporting
// documentation for such software.
//
// THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
// WARRANTY. IN PARTICULAR, THE AUTHOR MAKES NO REPRESENTATION OR WARRANTY
// OF ANY KIND CONCERNING THE MERCHANTABILITY OF THIS SOFTWARE OR ITS
// FITNESS FOR ANY PARTICULAR PURPOSE.
// Package fuse enables writing FUSE file systems on Linux, OS X, and FreeBSD.
//
// On OS X, it requires OSXFUSE (http://osxfuse.github.com/).
//
// There are two approaches to writing a FUSE file system. The first is to speak
// the low-level message protocol, reading from a Conn using ReadRequest and
// writing using the various Respond methods. This approach is closest to
// the actual interaction with the kernel and can be the simplest one in contexts
// such as protocol translators.
//
// Servers of synthesized file systems tend to share common
// bookkeeping abstracted away by the second approach, which is to
// call fs.Serve to serve the FUSE protocol using an implementation of
// the service methods in the interfaces FS* (file system), Node* (file
// or directory), and Handle* (opened file or directory).
// There are a daunting number of such methods that can be written,
// but few are required.
// The specific methods are described in the documentation for those interfaces.
//
// The hellofs subdirectory contains a simple illustration of the fs.Serve approach.
//
// Service Methods
//
// The required and optional methods for the FS, Node, and Handle interfaces
// have the general form
//
// Op(ctx context.Context, req *OpRequest, resp *OpResponse) error
//
// where Op is the name of a FUSE operation. Op reads request
// parameters from req and writes results to resp. An operation whose
// only result is the error result omits the resp parameter.
//
// Multiple goroutines may call service methods simultaneously; the
// methods being called are responsible for appropriate
// synchronization.
//
// The operation must not hold on to the request or response,
// including any []byte fields such as WriteRequest.Data or
// SetxattrRequest.Xattr.
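//
// As an illustrative sketch (helloFile is a hypothetical type), a
// read-only Getattr method has the form:
//
//	func (f helloFile) Getattr(ctx context.Context, req *fuse.GetattrRequest, resp *fuse.GetattrResponse) error {
//		resp.Attr.Mode = 0444
//		resp.Attr.Size = uint64(len("hello"))
//		return nil
//	}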
//
// Errors
//
// Operations can return errors. The FUSE interface can only
// communicate POSIX errno error numbers to file system clients; the
// error message itself is not visible to them. The returned error
// can implement ErrorNumber to control the errno returned. Without
// ErrorNumber, a generic errno (EIO) is returned.
//
// Error messages will be visible in the debug log as part of the
// response.
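//
// A minimal sketch of a custom error that maps to a specific errno; the
// type name is hypothetical:
//
//	type quotaExceededError struct{}
//
//	func (quotaExceededError) Error() string { return "quota exceeded" }
//
//	func (quotaExceededError) Errno() fuse.Errno { return fuse.Errno(syscall.EDQUOT) }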
//
// Interrupted Operations
//
// In some file systems, some operations
// may take an undetermined amount of time. For example, a Read waiting for
// a network message or a matching Write might wait indefinitely. If the request
// is cancelled and no longer needed, the context will be cancelled.
// Blocking operations should select on a receive from ctx.Done() and attempt to
// abort the operation early if the receive succeeds (meaning the channel is closed).
// To indicate that the operation failed because it was aborted, return fuse.EINTR.
//
// If an operation does not block for an indefinite amount of time, supporting
// cancellation is not necessary.
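//
// A hedged sketch of a cancellation-aware blocking Read; the handle type
// and its incoming channel are hypothetical:
//
//	func (h waitHandle) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) error {
//		select {
//		case data := <-h.incoming:
//			resp.Data = data
//			return nil
//		case <-ctx.Done():
//			return fuse.EINTR
//		}
//	}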
//
// Authentication
//
// All request types embed a Header, meaning that the method can
// inspect req.Pid, req.Uid, and req.Gid as necessary to implement
// permission checking. The kernel FUSE layer normally prevents other
// users from accessing the FUSE file system (to change this, see
// AllowOther, AllowRoot), but does not enforce access modes (to
// change this, see DefaultPermissions).
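//
// For example, a service method might restrict an operation to a single
// owner; ownerUID is a hypothetical value captured at mount time:
//
//	if req.Uid != ownerUID {
//		return fuse.EPERM
//	}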
//
// Mount Options
//
// Behavior and metadata of the mounted file system can be changed by
// passing MountOption values to Mount.
//
package fuse // import "github.com/cubefs/cubefs/depends/bazil.org/fuse"
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"os"
"sync"
"syscall"
"time"
"unsafe"
)
// A Conn represents a connection to a mounted FUSE file system.
type Conn struct {
// Ready is closed when the mount is complete or has failed.
Ready <-chan struct{}
// MountError stores any error from the mount process. Only valid
// after Ready is closed.
MountError error
// File handle for kernel communication. Only safe to access if
// rio or wio is held.
dev *os.File
wio sync.RWMutex
rio sync.RWMutex
// Protocol version negotiated with InitRequest/InitResponse.
proto Protocol
}
func (c *Conn) GetFuseDevFile() *os.File {
return c.dev
}
func (c *Conn) SetFuseDevFile(fud *os.File) {
c.dev = fud
}
// MountpointDoesNotExistError is an error returned when the
// mountpoint does not exist.
type MountpointDoesNotExistError struct {
Path string
}
var _ error = (*MountpointDoesNotExistError)(nil)
func (e *MountpointDoesNotExistError) Error() string {
return fmt.Sprintf("mountpoint does not exist: %v", e.Path)
}
// Mount mounts a new FUSE connection on the named directory
// and returns a connection for reading and writing FUSE messages.
//
// After a successful return, caller must call Close to free
// resources.
//
// Even on successful return, the new mount is not guaranteed to be
// visible until after Conn.Ready is closed. See Conn.MountError for
// possible errors. Incoming requests on Conn must be served to make
// progress.
func Mount(dir string, needRestoreFuse bool, options ...MountOption) (*Conn, error) {
conf := mountConfig{
options: make(map[string]string),
}
for _, option := range options {
if err := option(&conf); err != nil {
return nil, err
}
}
ready := make(chan struct{}, 1)
c := &Conn{
Ready: ready,
}
if !needRestoreFuse {
f, err := mount(dir, &conf, ready, &c.MountError)
if err != nil {
return nil, err
}
c.dev = f
if err := initMount(c, &conf); err != nil {
c.Close()
if err == ErrClosedWithoutInit {
// see if we can provide a better error
<-c.Ready
if err := c.MountError; err != nil {
return nil, err
}
}
return nil, err
}
} else {
close(ready)
// FIXME: save protocol version when saving context?
c.proto = Protocol{protoVersionMaxMajor, protoVersionMaxMinor}
}
InitReadBlockPool()
return c, nil
}
type OldVersionError struct {
Kernel Protocol
LibraryMin Protocol
}
func (e *OldVersionError) Error() string {
return fmt.Sprintf("kernel FUSE version is too old: %v < %v", e.Kernel, e.LibraryMin)
}
var (
ErrClosedWithoutInit = errors.New("fuse connection closed without init")
)
func initMount(c *Conn, conf *mountConfig) error {
req, err := c.ReadRequest()
if err != nil {
if err == io.EOF {
return ErrClosedWithoutInit
}
return err
}
r, ok := req.(*InitRequest)
if !ok {
return fmt.Errorf("missing init, got: %T", req)
}
min := Protocol{protoVersionMinMajor, protoVersionMinMinor}
if r.Kernel.LT(min) {
req.RespondError(Errno(syscall.EPROTO))
c.Close()
return &OldVersionError{
Kernel: r.Kernel,
LibraryMin: min,
}
}
proto := Protocol{protoVersionMaxMajor, protoVersionMaxMinor}
if r.Kernel.LT(proto) {
// Kernel doesn't support the latest version we have.
proto = r.Kernel
}
c.proto = proto
s := &InitResponse{
Library: proto,
MaxReadahead: conf.maxReadahead,
MaxWrite: maxWrite,
Flags: InitBigWrites | conf.initFlags,
}
r.Respond(s)
return nil
}
// A Request represents a single FUSE request received from the kernel.
// Use a type switch to determine the specific kind.
// A request of unrecognized type will have concrete type *Header.
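//
// A hedged sketch of the low-level loop body described in the package
// documentation; c is a hypothetical *Conn:
//
//	req, err := c.ReadRequest()
//	if err != nil {
//		return err // io.EOF once the connection is closed
//	}
//	switch r := req.(type) {
//	case *fuse.StatfsRequest:
//		r.Respond(&fuse.StatfsResponse{})
//	default:
//		r.RespondError(fuse.ENOSYS)
//	}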
type Request interface {
// Hdr returns the Header associated with this request.
Hdr() *Header
// RespondError responds to the request with the given error.
RespondError(error)
String() string
}
// A RequestID identifies an active FUSE request.
type RequestID uint64
func (r RequestID) String() string {
return fmt.Sprintf("%#x", uint64(r))
}
// A NodeID is a number identifying a directory or file.
// It must be unique among IDs returned in LookupResponses
// that have not yet been forgotten by ForgetRequests.
type NodeID uint64
func (n NodeID) String() string {
return fmt.Sprintf("%#x", uint64(n))
}
// A HandleID is a number identifying an open directory or file.
// It only needs to be unique while the directory or file is open.
type HandleID uint64
func (h HandleID) String() string {
return fmt.Sprintf("%#x", uint64(h))
}
// The RootID identifies the root directory of a FUSE file system.
const RootID NodeID = rootID
// A Header describes the basic information sent in every request.
type Header struct {
Conn *Conn `json:"-"` // connection this request was received on
ID RequestID // unique ID for request
Node NodeID // file or directory the request is about
Uid uint32 // user ID of process making request
Gid uint32 // group ID of process making request
Pid uint32 // process ID of process making request
// for returning to reqPool
msg *message
}
func (h *Header) String() string {
return fmt.Sprintf("ID=%v Node=%v Uid=%d Gid=%d Pid=%d", h.ID, h.Node, h.Uid, h.Gid, h.Pid)
}
func (h *Header) Hdr() *Header {
return h
}
func (h *Header) noResponse() {
putMessage(h.msg)
}
func (h *Header) respond(msg []byte) {
out := (*outHeader)(unsafe.Pointer(&msg[0]))
out.Unique = uint64(h.ID)
h.Conn.respond(msg)
putMessage(h.msg)
}
func (h *Header) respondDoNotReuseMsg(msg []byte) {
out := (*outHeader)(unsafe.Pointer(&msg[0]))
out.Unique = uint64(h.ID)
h.Conn.respond(msg)
}
// An ErrorNumber is an error with a specific error number.
//
// Operations may return an error value that implements ErrorNumber to
// control what specific error number (errno) to return.
type ErrorNumber interface {
// Errno returns the error number (errno) for this error.
Errno() Errno
}
const (
// ENOSYS indicates that the call is not supported.
ENOSYS = Errno(syscall.ENOSYS)
// ESTALE is used by Serve to respond to violations of the FUSE protocol.
ESTALE = Errno(syscall.ESTALE)
ENOENT = Errno(syscall.ENOENT)
EIO = Errno(syscall.EIO)
EPERM = Errno(syscall.EPERM)
// EINTR indicates request was interrupted by an InterruptRequest.
// See also fs.Intr.
EINTR = Errno(syscall.EINTR)
ERANGE = Errno(syscall.ERANGE)
ENOTSUP = Errno(syscall.ENOTSUP)
EEXIST = Errno(syscall.EEXIST)
ETIME = Errno(syscall.ETIME)
ETIMEDOUT = Errno(syscall.ETIMEDOUT)
)
// DefaultErrno is the errno used when error returned does not
// implement ErrorNumber.
const DefaultErrno = EIO
var errnoNames = map[Errno]string{
ENOSYS: "ENOSYS",
ESTALE: "ESTALE",
ENOENT: "ENOENT",
EIO: "EIO",
EPERM: "EPERM",
EINTR: "EINTR",
EEXIST: "EEXIST",
ETIME: "ETIME",
ETIMEDOUT: "ETIMEDOUT",
}
// Errno implements Error and ErrorNumber using a syscall.Errno.
type Errno syscall.Errno
var _ = ErrorNumber(Errno(0))
var _ = error(Errno(0))
func (e Errno) Errno() Errno {
return e
}
func (e Errno) String() string {
return syscall.Errno(e).Error()
}
func (e Errno) Error() string {
return syscall.Errno(e).Error()
}
// ErrnoName returns the short non-numeric identifier for this errno.
// For example, "EIO".
func (e Errno) ErrnoName() string {
s := errnoNames[e]
if s == "" {
s = fmt.Sprint(e.Errno())
}
return s
}
func (e Errno) MarshalText() ([]byte, error) {
s := e.ErrnoName()
return []byte(s), nil
}
func (h *Header) RespondError(err error) {
errno := DefaultErrno
if ferr, ok := err.(ErrorNumber); ok {
errno = ferr.Errno()
}
// FUSE uses negative errors!
// TODO: File bug report against OSXFUSE: positive error causes kernel panic.
buf := newBuffer(0)
hOut := (*outHeader)(unsafe.Pointer(&buf[0]))
hOut.Error = -int32(errno)
h.respondDoNotReuseMsg(buf)
}
// All requests read from the kernel, without data, are shorter than
// this.
var maxRequestSize = syscall.Getpagesize()
var bufSize = maxRequestSize + maxWrite
// reqPool is a pool of messages.
//
// Lifetime of a logical message is from getMessage to putMessage.
// getMessage is called by ReadRequest. putMessage is called by
// Conn.ReadRequest, Request.Respond, or Request.RespondError.
//
// Messages in the pool are guaranteed to have conn and off zeroed,
// buf allocated and len==bufSize, and hdr set.
var reqPool = sync.Pool{
New: allocMessage,
}
func allocMessage() interface{} {
m := &message{buf: make([]byte, bufSize)}
m.hdr = (*inHeader)(unsafe.Pointer(&m.buf[0]))
return m
}
func getMessage(c *Conn) *message {
m := reqPool.Get().(*message)
m.conn = c
return m
}
func putMessage(m *message) {
m.buf = m.buf[:bufSize]
m.conn = nil
m.off = 0
reqPool.Put(m)
}
// a message represents the bytes of a single FUSE message
type message struct {
conn *Conn
buf []byte // all bytes
hdr *inHeader // header
off int // offset for reading additional fields
}
func (m *message) len() uintptr {
return uintptr(len(m.buf) - m.off)
}
func (m *message) data() unsafe.Pointer {
var p unsafe.Pointer
if m.off < len(m.buf) {
p = unsafe.Pointer(&m.buf[m.off])
}
return p
}
func (m *message) bytes() []byte {
return m.buf[m.off:]
}
func (m *message) Header() Header {
h := m.hdr
return Header{
Conn: m.conn,
ID: RequestID(h.Unique),
Node: NodeID(h.Nodeid),
Uid: h.Uid,
Gid: h.Gid,
Pid: h.Pid,
msg: m,
}
}
// fileMode returns a Go os.FileMode from a Unix mode.
func fileMode(unixMode uint32) os.FileMode {
mode := os.FileMode(unixMode & 0777)
switch unixMode & syscall.S_IFMT {
case syscall.S_IFREG:
// nothing
case syscall.S_IFDIR:
mode |= os.ModeDir
case syscall.S_IFCHR:
mode |= os.ModeCharDevice | os.ModeDevice
case syscall.S_IFBLK:
mode |= os.ModeDevice
case syscall.S_IFIFO:
mode |= os.ModeNamedPipe
case syscall.S_IFLNK:
mode |= os.ModeSymlink
case syscall.S_IFSOCK:
mode |= os.ModeSocket
default:
// no idea
mode |= os.ModeDevice
}
if unixMode&syscall.S_ISUID != 0 {
mode |= os.ModeSetuid
}
if unixMode&syscall.S_ISGID != 0 {
mode |= os.ModeSetgid
}
return mode
}
type noOpcode struct {
Opcode uint32
}
func (m noOpcode) String() string {
return fmt.Sprintf("No opcode %v", m.Opcode)
}
type malformedMessage struct {
}
func (malformedMessage) String() string {
return "malformed message"
}
// Close closes the FUSE connection.
func (c *Conn) Close() error {
c.wio.Lock()
defer c.wio.Unlock()
c.rio.Lock()
defer c.rio.Unlock()
return c.dev.Close()
}
// caller must hold wio or rio
func (c *Conn) fd() int {
return int(c.dev.Fd())
}
func (c *Conn) Protocol() Protocol {
return c.proto
}
// ReadRequest returns the next FUSE request from the kernel.
//
// Caller must call either Request.Respond or Request.RespondError in
// a reasonable time. Caller must not retain Request after that call.
func (c *Conn) ReadRequest() (Request, error) {
m := getMessage(c)
loop:
c.rio.RLock()
n, err := syscall.Read(c.fd(), m.buf)
c.rio.RUnlock()
if err == syscall.EINTR {
// OSXFUSE sends EINTR to userspace when a request interrupt
// completed before it got sent to userspace?
goto loop
}
if err != nil && err != syscall.ENODEV {
putMessage(m)
return nil, err
}
if n <= 0 {
putMessage(m)
return nil, io.EOF
}
m.buf = m.buf[:n]
if n < inHeaderSize {
putMessage(m)
return nil, errors.New("fuse: message too short")
}
// FreeBSD FUSE sends a short length in the header
// for FUSE_INIT even though the actual read length is correct.
if n == inHeaderSize+initInSize && m.hdr.Opcode == opInit && m.hdr.Len < uint32(n) {
m.hdr.Len = uint32(n)
}
// OSXFUSE sometimes sends the wrong m.hdr.Len in a FUSE_WRITE message.
if m.hdr.Len < uint32(n) && m.hdr.Len >= uint32(unsafe.Sizeof(writeIn{})) && m.hdr.Opcode == opWrite {
m.hdr.Len = uint32(n)
}
if m.hdr.Len != uint32(n) {
// prepare error message before returning m to pool
err := fmt.Errorf("fuse: read %d opcode %d but expected %d", n, m.hdr.Opcode, m.hdr.Len)
putMessage(m)
return nil, err
}
m.off = inHeaderSize
// Convert to data structures.
// Do not trust kernel to hand us well-formed data.
var req Request
switch m.hdr.Opcode {
default:
Debug(noOpcode{Opcode: m.hdr.Opcode})
goto unrecognized
case opLookup:
buf := m.bytes()
n := len(buf)
if n == 0 || buf[n-1] != '\x00' {
goto corrupt
}
req = &LookupRequest{
Header: m.Header(),
Name: string(buf[:n-1]),
}
case opForget:
in := (*forgetIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &ForgetRequest{
Header: m.Header(),
N: in.Nlookup,
}
case opGetattr:
switch {
case c.proto.LT(Protocol{7, 9}):
req = &GetattrRequest{
Header: m.Header(),
}
default:
in := (*getattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &GetattrRequest{
Header: m.Header(),
Flags: GetattrFlags(in.GetattrFlags),
Handle: HandleID(in.Fh),
}
}
case opSetattr:
in := (*setattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &SetattrRequest{
Header: m.Header(),
Valid: SetattrValid(in.Valid),
Handle: HandleID(in.Fh),
Size: in.Size,
Atime: time.Unix(int64(in.Atime), int64(in.AtimeNsec)),
Mtime: time.Unix(int64(in.Mtime), int64(in.MtimeNsec)),
Mode: fileMode(in.Mode),
Uid: in.Uid,
Gid: in.Gid,
Bkuptime: in.BkupTime(),
Chgtime: in.Chgtime(),
Flags: in.Flags(),
}
case opReadlink:
if len(m.bytes()) > 0 {
goto corrupt
}
req = &ReadlinkRequest{
Header: m.Header(),
}
case opSymlink:
// m.bytes() is "newName\0target\0"
names := m.bytes()
if len(names) == 0 || names[len(names)-1] != 0 {
goto corrupt
}
i := bytes.IndexByte(names, '\x00')
if i < 0 {
goto corrupt
}
newName, target := names[0:i], names[i+1:len(names)-1]
req = &SymlinkRequest{
Header: m.Header(),
NewName: string(newName),
Target: string(target),
}
case opLink:
in := (*linkIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
newName := m.bytes()[unsafe.Sizeof(*in):]
if len(newName) < 2 || newName[len(newName)-1] != 0 {
goto corrupt
}
newName = newName[:len(newName)-1]
req = &LinkRequest{
Header: m.Header(),
OldNode: NodeID(in.Oldnodeid),
NewName: string(newName),
}
case opMknod:
size := mknodInSize(c.proto)
if m.len() < size {
goto corrupt
}
in := (*mknodIn)(m.data())
name := m.bytes()[size:]
if len(name) < 2 || name[len(name)-1] != '\x00' {
goto corrupt
}
name = name[:len(name)-1]
r := &MknodRequest{
Header: m.Header(),
Mode: fileMode(in.Mode),
Rdev: in.Rdev,
Name: string(name),
}
if c.proto.GE(Protocol{7, 12}) {
r.Umask = fileMode(in.Umask) & os.ModePerm
}
req = r
case opMkdir:
size := mkdirInSize(c.proto)
if m.len() < size {
goto corrupt
}
in := (*mkdirIn)(m.data())
name := m.bytes()[size:]
i := bytes.IndexByte(name, '\x00')
if i < 0 {
goto corrupt
}
r := &MkdirRequest{
Header: m.Header(),
Name: string(name[:i]),
// observed on Linux: mkdirIn.Mode & syscall.S_IFMT == 0,
// and this causes fileMode to go into its "no idea"
// code branch; enforce type to directory
Mode: fileMode((in.Mode &^ syscall.S_IFMT) | syscall.S_IFDIR),
}
if c.proto.GE(Protocol{7, 12}) {
r.Umask = fileMode(in.Umask) & os.ModePerm
}
req = r
case opUnlink, opRmdir:
buf := m.bytes()
n := len(buf)
if n == 0 || buf[n-1] != '\x00' {
goto corrupt
}
req = &RemoveRequest{
Header: m.Header(),
Name: string(buf[:n-1]),
Dir: m.hdr.Opcode == opRmdir,
}
case opRename:
in := (*renameIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
newDirNodeID := NodeID(in.Newdir)
oldNew := m.bytes()[unsafe.Sizeof(*in):]
// oldNew should be "old\x00new\x00"
if len(oldNew) < 4 {
goto corrupt
}
if oldNew[len(oldNew)-1] != '\x00' {
goto corrupt
}
i := bytes.IndexByte(oldNew, '\x00')
if i < 0 {
goto corrupt
}
oldName, newName := string(oldNew[:i]), string(oldNew[i+1:len(oldNew)-1])
req = &RenameRequest{
Header: m.Header(),
NewDir: newDirNodeID,
OldName: oldName,
NewName: newName,
}
case opOpendir, opOpen:
in := (*openIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &OpenRequest{
Header: m.Header(),
Dir: m.hdr.Opcode == opOpendir,
Flags: openFlags(in.Flags),
}
case opRead, opReaddir:
in := (*readIn)(m.data())
if m.len() < readInSize(c.proto) {
goto corrupt
}
r := &ReadRequest{
Header: m.Header(),
Dir: m.hdr.Opcode == opReaddir,
Handle: HandleID(in.Fh),
Offset: int64(in.Offset),
Size: int(in.Size),
}
if c.proto.GE(Protocol{7, 9}) {
r.Flags = ReadFlags(in.ReadFlags)
r.LockOwner = in.LockOwner
r.FileFlags = openFlags(in.Flags)
}
req = r
case opWrite:
in := (*writeIn)(m.data())
if m.len() < writeInSize(c.proto) {
goto corrupt
}
r := &WriteRequest{
Header: m.Header(),
Handle: HandleID(in.Fh),
Offset: int64(in.Offset),
Flags: WriteFlags(in.WriteFlags),
}
if c.proto.GE(Protocol{7, 9}) {
r.LockOwner = in.LockOwner
r.FileFlags = openFlags(in.Flags)
}
buf := m.bytes()[writeInSize(c.proto):]
if uint32(len(buf)) < in.Size {
goto corrupt
}
r.Data = buf
req = r
case opStatfs:
req = &StatfsRequest{
Header: m.Header(),
}
case opRelease, opReleasedir:
in := (*releaseIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &ReleaseRequest{
Header: m.Header(),
Dir: m.hdr.Opcode == opReleasedir,
Handle: HandleID(in.Fh),
Flags: openFlags(in.Flags),
ReleaseFlags: ReleaseFlags(in.ReleaseFlags),
LockOwner: in.LockOwner,
}
case opFsync, opFsyncdir:
in := (*fsyncIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &FsyncRequest{
Dir: m.hdr.Opcode == opFsyncdir,
Header: m.Header(),
Handle: HandleID(in.Fh),
Flags: in.FsyncFlags,
}
case opSetxattr:
in := (*setxattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
m.off += int(unsafe.Sizeof(*in))
name := m.bytes()
i := bytes.IndexByte(name, '\x00')
if i < 0 {
goto corrupt
}
xattr := name[i+1:]
if uint32(len(xattr)) < in.Size {
goto corrupt
}
xattr = xattr[:in.Size]
req = &SetxattrRequest{
Header: m.Header(),
Flags: in.Flags,
Position: in.position(),
Name: string(name[:i]),
Xattr: xattr,
}
case opGetxattr:
in := (*getxattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
name := m.bytes()[unsafe.Sizeof(*in):]
i := bytes.IndexByte(name, '\x00')
if i < 0 {
goto corrupt
}
req = &GetxattrRequest{
Header: m.Header(),
Name: string(name[:i]),
Size: in.Size,
Position: in.position(),
}
case opListxattr:
in := (*getxattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &ListxattrRequest{
Header: m.Header(),
Size: in.Size,
Position: in.position(),
}
case opRemovexattr:
buf := m.bytes()
n := len(buf)
if n == 0 || buf[n-1] != '\x00' {
goto corrupt
}
req = &RemovexattrRequest{
Header: m.Header(),
Name: string(buf[:n-1]),
}
case opFlush:
in := (*flushIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &FlushRequest{
Header: m.Header(),
Handle: HandleID(in.Fh),
Flags: in.FlushFlags,
LockOwner: in.LockOwner,
}
case opInit:
in := (*initIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &InitRequest{
Header: m.Header(),
Kernel: Protocol{in.Major, in.Minor},
MaxReadahead: in.MaxReadahead,
Flags: InitFlags(in.Flags),
}
case opGetlk:
panic("opGetlk")
case opSetlk:
panic("opSetlk")
case opSetlkw:
panic("opSetlkw")
case opAccess:
in := (*accessIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &AccessRequest{
Header: m.Header(),
Mask: in.Mask,
}
case opCreate:
size := createInSize(c.proto)
if m.len() < size {
goto corrupt
}
in := (*createIn)(m.data())
name := m.bytes()[size:]
i := bytes.IndexByte(name, '\x00')
if i < 0 {
goto corrupt
}
r := &CreateRequest{
Header: m.Header(),
Flags: openFlags(in.Flags),
Mode: fileMode(in.Mode),
Name: string(name[:i]),
}
if c.proto.GE(Protocol{7, 12}) {
r.Umask = fileMode(in.Umask) & os.ModePerm
}
req = r
case opInterrupt:
in := (*interruptIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &InterruptRequest{
Header: m.Header(),
IntrID: RequestID(in.Unique),
}
case opBmap:
panic("opBmap")
case opDestroy:
req = &DestroyRequest{
Header: m.Header(),
}
// OS X
case opSetvolname:
panic("opSetvolname")
case opGetxtimes:
panic("opGetxtimes")
case opExchange:
in := (*exchangeIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
oldDirNodeID := NodeID(in.Olddir)
newDirNodeID := NodeID(in.Newdir)
oldNew := m.bytes()[unsafe.Sizeof(*in):]
// oldNew should be "oldname\x00newname\x00"
if len(oldNew) < 4 {
goto corrupt
}
if oldNew[len(oldNew)-1] != '\x00' {
goto corrupt
}
i := bytes.IndexByte(oldNew, '\x00')
if i < 0 {
goto corrupt
}
oldName, newName := string(oldNew[:i]), string(oldNew[i+1:len(oldNew)-1])
req = &ExchangeDataRequest{
Header: m.Header(),
OldDir: oldDirNodeID,
NewDir: newDirNodeID,
OldName: oldName,
NewName: newName,
// TODO options
}
}
return req, nil
corrupt:
Debug(malformedMessage{})
putMessage(m)
return nil, fmt.Errorf("fuse: malformed message")
unrecognized:
// Unrecognized message.
// Assume higher-level code will send a "no idea what you mean" error.
h := m.Header()
return &h, nil
}
type bugShortKernelWrite struct {
Written int64
Length int64
Error string
Stack string
}
func (b bugShortKernelWrite) String() string {
return fmt.Sprintf("short kernel write: written=%d/%d error=%q stack=\n%s", b.Written, b.Length, b.Error, b.Stack)
}
type bugKernelWriteError struct {
Error string
Stack string
}
func (b bugKernelWriteError) String() string {
return fmt.Sprintf("kernel write error: error=%q stack=\n%s", b.Error, b.Stack)
}
// safe to call even with nil error
func errorString(err error) string {
if err == nil {
return ""
}
return err.Error()
}
func (c *Conn) writeToKernel(msg []byte) error {
out := (*outHeader)(unsafe.Pointer(&msg[0]))
out.Len = uint32(len(msg))
c.wio.RLock()
defer c.wio.RUnlock()
nn, err := syscall.Write(c.fd(), msg)
if err == nil && nn != len(msg) {
Debug(bugShortKernelWrite{
Written: int64(nn),
Length: int64(len(msg)),
Error: errorString(err),
Stack: stack(),
})
}
return err
}
func (c *Conn) respond(msg []byte) {
if err := c.writeToKernel(msg); err != nil {
Debug(bugKernelWriteError{
Error: errorString(err),
Stack: stack(),
})
}
}
type notCachedError struct{}
func (notCachedError) Error() string {
return "node not cached"
}
var _ ErrorNumber = notCachedError{}
func (notCachedError) Errno() Errno {
// Behave just as if the original syscall.ENOENT had been passed
// straight through.
return ENOENT
}
var (
ErrNotCached = notCachedError{}
)
// sendInvalidate sends an invalidate notification to kernel.
//
// A returned ENOENT is translated to a friendlier error.
func (c *Conn) sendInvalidate(msg []byte) error {
switch err := c.writeToKernel(msg); err {
case syscall.ENOENT:
return ErrNotCached
default:
return err
}
}
// InvalidateNode invalidates the kernel cache of the attributes and a
// range of the data of a node.
//
// Giving offset 0 and size -1 means all data. To invalidate just the
// attributes, give offset 0 and size 0.
//
// Returns ErrNotCached if the kernel is not currently caching the
// node.
func (c *Conn) InvalidateNode(nodeID NodeID, off int64, size int64) error {
buf := newBuffer(unsafe.Sizeof(notifyInvalInodeOut{}))
h := (*outHeader)(unsafe.Pointer(&buf[0]))
// h.Unique is 0
h.Error = notifyCodeInvalInode
out := (*notifyInvalInodeOut)(buf.alloc(unsafe.Sizeof(notifyInvalInodeOut{})))
out.Ino = uint64(nodeID)
out.Off = off
out.Len = size
return c.sendInvalidate(buf)
}
// InvalidateEntry invalidates the kernel cache of the directory entry
// identified by parent directory node ID and entry basename.
//
// Kernel may or may not cache directory listings. To invalidate
// those, use InvalidateNode to invalidate all of the data for a
// directory. (As of 2015-06, Linux FUSE does not cache directory
// listings.)
//
// Returns ErrNotCached if the kernel is not currently caching the
// node.
func (c *Conn) InvalidateEntry(parent NodeID, name string) error {
const maxUint32 = ^uint32(0)
if uint64(len(name)) > uint64(maxUint32) {
// very unlikely, but we don't want to silently truncate
return syscall.ENAMETOOLONG
}
buf := newBuffer(unsafe.Sizeof(notifyInvalEntryOut{}) + uintptr(len(name)) + 1)
h := (*outHeader)(unsafe.Pointer(&buf[0]))
// h.Unique is 0
h.Error = notifyCodeInvalEntry
out := (*notifyInvalEntryOut)(buf.alloc(unsafe.Sizeof(notifyInvalEntryOut{})))
out.Parent = uint64(parent)
out.Namelen = uint32(len(name))
buf = append(buf, name...)
buf = append(buf, '\x00')
return c.sendInvalidate(buf)
}
// An InitRequest is the first request sent on a FUSE file system.
type InitRequest struct {
Header `json:"-"`
Kernel Protocol
// Maximum readahead in bytes that the kernel plans to use.
MaxReadahead uint32
Flags InitFlags
}
var _ = Request(&InitRequest{})
func (r *InitRequest) String() string {
return fmt.Sprintf("Init [%v] %v ra=%d fl=%v", &r.Header, r.Kernel, r.MaxReadahead, r.Flags)
}
// An InitResponse is the response to an InitRequest.
type InitResponse struct {
Library Protocol
// Maximum readahead in bytes that the kernel can use. Ignored if
// greater than InitRequest.MaxReadahead.
MaxReadahead uint32
Flags InitFlags
// Maximum size of a single write operation.
// Linux enforces a minimum of 4 KiB.
MaxWrite uint32
}
func (r *InitResponse) String() string {
return fmt.Sprintf("Init %v ra=%d fl=%v w=%d", r.Library, r.MaxReadahead, r.Flags, r.MaxWrite)
}
// Respond replies to the request with the given response.
func (r *InitRequest) Respond(resp *InitResponse) {
buf := newBuffer(unsafe.Sizeof(initOut{}))
out := (*initOut)(buf.alloc(unsafe.Sizeof(initOut{})))
out.Major = resp.Library.Major
out.Minor = resp.Library.Minor
out.MaxReadahead = resp.MaxReadahead
out.Flags = uint32(resp.Flags)
out.MaxWrite = resp.MaxWrite
// MaxWrite larger than our receive buffer would just lead to
// errors on large writes.
if out.MaxWrite > maxWrite {
out.MaxWrite = maxWrite
}
r.respond(buf)
}
// A StatfsRequest requests information about the mounted file system.
type StatfsRequest struct {
Header `json:"-"`
}
var _ = Request(&StatfsRequest{})
func (r *StatfsRequest) String() string {
return fmt.Sprintf("Statfs [%s]", &r.Header)
}
// Respond replies to the request with the given response.
func (r *StatfsRequest) Respond(resp *StatfsResponse) {
buf := newBuffer(unsafe.Sizeof(statfsOut{}))
out := (*statfsOut)(buf.alloc(unsafe.Sizeof(statfsOut{})))
out.St = kstatfs{
Blocks: resp.Blocks,
Bfree: resp.Bfree,
Bavail: resp.Bavail,
Files: resp.Files,
Ffree: resp.Ffree,
Bsize: resp.Bsize,
Namelen: resp.Namelen,
Frsize: resp.Frsize,
}
r.respond(buf)
}
// A StatfsResponse is the response to a StatfsRequest.
type StatfsResponse struct {
Blocks uint64 // Total data blocks in file system.
Bfree uint64 // Free blocks in file system.
Bavail uint64 // Free blocks in file system if you're not root.
Files uint64 // Total files in file system.
Ffree uint64 // Free files in file system.
Bsize uint32 // Block size
Namelen uint32 // Maximum file name length?
Frsize uint32 // Fragment size, smallest addressable data size in the file system.
}
func (r *StatfsResponse) String() string {
return fmt.Sprintf("Statfs blocks=%d/%d/%d files=%d/%d bsize=%d frsize=%d namelen=%d",
r.Bavail, r.Bfree, r.Blocks,
r.Ffree, r.Files,
r.Bsize,
r.Frsize,
r.Namelen,
)
}
// An AccessRequest asks whether the file can be accessed
// for the purpose specified by the mask.
type AccessRequest struct {
Header `json:"-"`
Mask uint32
}
var _ = Request(&AccessRequest{})
func (r *AccessRequest) String() string {
return fmt.Sprintf("Access [%s] mask=%#x", &r.Header, r.Mask)
}
// Respond replies to the request indicating that access is allowed.
// To deny access, use RespondError.
func (r *AccessRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// An Attr is the metadata for a single file or directory.
type Attr struct {
Valid time.Duration // how long Attr can be cached
Inode uint64 // inode number
Size uint64 // size in bytes
Blocks uint64 // size in 512-byte units
Atime time.Time // time of last access
Mtime time.Time // time of last modification
Ctime time.Time // time of last inode change
Crtime time.Time // time of creation (OS X only)
Mode os.FileMode // file mode
Nlink uint32 // number of links (usually 1)
Uid uint32 // owner uid
Gid uint32 // group gid
Rdev uint32 // device numbers
Flags uint32 // chflags(2) flags (OS X only)
BlockSize uint32 // preferred blocksize for filesystem I/O
ParentIno uint64 // used only by CubeFS files
}
func (a Attr) String() string {
return fmt.Sprintf("valid=%v ino=%v size=%d mode=%v", a.Valid, a.Inode, a.Size, a.Mode)
}
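// Illustrative sketch (hypothetical values, not part of the original
// package): a typical regular-file Attr as a file system implementation
// might return it in a Getattr or Lookup response.
func exampleAttr() Attr {
	return Attr{
		Valid:     time.Minute, // let the kernel cache these attributes for a minute
		Inode:     42,
		Size:      4096,
		Blocks:    8, // in 512-byte units
		Mode:      0644,
		Nlink:     1,
		BlockSize: 4096,
	}
}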
func unix(t time.Time) (sec uint64, nsec uint32) {
nano := t.UnixNano()
sec = uint64(nano / 1e9)
nsec = uint32(nano % 1e9)
return
}
func (a *Attr) attr(out *attr, proto Protocol) {
out.Ino = a.Inode
out.Size = a.Size
out.Blocks = a.Blocks
out.Atime, out.AtimeNsec = unix(a.Atime)
out.Mtime, out.MtimeNsec = unix(a.Mtime)
out.Ctime, out.CtimeNsec = unix(a.Ctime)
out.SetCrtime(unix(a.Crtime))
out.Mode = uint32(a.Mode) & 0777
switch {
default:
out.Mode |= syscall.S_IFREG
case a.Mode&os.ModeDir != 0:
out.Mode |= syscall.S_IFDIR
case a.Mode&os.ModeDevice != 0:
if a.Mode&os.ModeCharDevice != 0 {
out.Mode |= syscall.S_IFCHR
} else {
out.Mode |= syscall.S_IFBLK
}
case a.Mode&os.ModeNamedPipe != 0:
out.Mode |= syscall.S_IFIFO
case a.Mode&os.ModeSymlink != 0:
out.Mode |= syscall.S_IFLNK
case a.Mode&os.ModeSocket != 0:
out.Mode |= syscall.S_IFSOCK
}
if a.Mode&os.ModeSetuid != 0 {
out.Mode |= syscall.S_ISUID
}
if a.Mode&os.ModeSetgid != 0 {
out.Mode |= syscall.S_ISGID
}
out.Nlink = a.Nlink
out.Uid = a.Uid
out.Gid = a.Gid
out.Rdev = a.Rdev
out.SetFlags(a.Flags)
if proto.GE(Protocol{7, 9}) {
out.Blksize = a.BlockSize
}
return
}
// A GetattrRequest asks for the metadata for the file denoted by r.Node.
type GetattrRequest struct {
Header `json:"-"`
Flags GetattrFlags
Handle HandleID
}
var _ = Request(&GetattrRequest{})
func (r *GetattrRequest) String() string {
return fmt.Sprintf("Getattr [%s] %v fl=%v", &r.Header, r.Handle, r.Flags)
}
// Respond replies to the request with the given response.
func (r *GetattrRequest) Respond(resp *GetattrResponse) {
size := attrOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*attrOut)(buf.alloc(size))
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A GetattrResponse is the response to a GetattrRequest.
type GetattrResponse struct {
Attr Attr // file attributes
}
func (r *GetattrResponse) String() string {
return fmt.Sprintf("Getattr %v", r.Attr)
}
// A GetxattrRequest asks for the extended attributes associated with r.Node.
type GetxattrRequest struct {
Header `json:"-"`
// Maximum size to return.
Size uint32
// Name of the attribute requested.
Name string
// Offset within extended attributes.
//
// Only valid for OS X, and then only with the resource fork
// attribute.
Position uint32
}
var _ = Request(&GetxattrRequest{})
func (r *GetxattrRequest) String() string {
return fmt.Sprintf("Getxattr [%s] %q %d @%d", &r.Header, r.Name, r.Size, r.Position)
}
// Respond replies to the request with the given response.
func (r *GetxattrRequest) Respond(resp *GetxattrResponse) {
if r.Size == 0 {
buf := newBuffer(unsafe.Sizeof(getxattrOut{}))
out := (*getxattrOut)(buf.alloc(unsafe.Sizeof(getxattrOut{})))
out.Size = uint32(len(resp.Xattr))
r.respond(buf)
} else {
buf := newBuffer(uintptr(len(resp.Xattr)))
buf = append(buf, resp.Xattr...)
r.respond(buf)
}
}
// A GetxattrResponse is the response to a GetxattrRequest.
type GetxattrResponse struct {
Xattr []byte
}
func (r *GetxattrResponse) String() string {
return fmt.Sprintf("Getxattr %x", r.Xattr)
}
// A ListxattrRequest asks to list the extended attributes associated with r.Node.
type ListxattrRequest struct {
Header `json:"-"`
Size uint32 // maximum size to return
Position uint32 // offset within attribute list
}
var _ = Request(&ListxattrRequest{})
func (r *ListxattrRequest) String() string {
return fmt.Sprintf("Listxattr [%s] %d @%d", &r.Header, r.Size, r.Position)
}
// Respond replies to the request with the given response.
func (r *ListxattrRequest) Respond(resp *ListxattrResponse) {
if r.Size == 0 {
buf := newBuffer(unsafe.Sizeof(getxattrOut{}))
out := (*getxattrOut)(buf.alloc(unsafe.Sizeof(getxattrOut{})))
out.Size = uint32(len(resp.Xattr))
r.respond(buf)
} else {
buf := newBuffer(uintptr(len(resp.Xattr)))
buf = append(buf, resp.Xattr...)
r.respond(buf)
}
}
// A ListxattrResponse is the response to a ListxattrRequest.
type ListxattrResponse struct {
Xattr []byte
}
func (r *ListxattrResponse) String() string {
return fmt.Sprintf("Listxattr %x", r.Xattr)
}
// Append adds an extended attribute name to the response.
func (r *ListxattrResponse) Append(names ...string) {
for _, name := range names {
r.Xattr = append(r.Xattr, name...)
r.Xattr = append(r.Xattr, '\x00')
}
}
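// Illustrative sketch (hypothetical attribute names, not part of the
// original package): Append encodes each name as a NUL-terminated string,
// which is the layout a listxattr reply carries.
func exampleListxattr(resp *ListxattrResponse) {
	resp.Append("user.mime_type", "security.selinux")
	// resp.Xattr now holds "user.mime_type\x00security.selinux\x00".
}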
// A RemovexattrRequest asks to remove an extended attribute associated with r.Node.
type RemovexattrRequest struct {
Header `json:"-"`
Name string // name of extended attribute
}
var _ = Request(&RemovexattrRequest{})
func (r *RemovexattrRequest) String() string {
return fmt.Sprintf("Removexattr [%s] %q", &r.Header, r.Name)
}
// Respond replies to the request, indicating that the attribute was removed.
func (r *RemovexattrRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A SetxattrRequest asks to set an extended attribute associated with a file.
type SetxattrRequest struct {
Header `json:"-"`
// Flags can make the request fail if the attribute does or does not
// already exist. Unfortunately, the constants are platform-specific
// and not exposed by Go 1.2. Look for XATTR_CREATE, XATTR_REPLACE.
//
// TODO improve this later
//
// TODO XATTR_CREATE and exist -> EEXIST
//
// TODO XATTR_REPLACE and not exist -> ENODATA
Flags uint32
// Offset within extended attributes.
//
// Only valid for OS X, and then only with the resource fork
// attribute.
Position uint32
Name string
Xattr []byte
}
var _ = Request(&SetxattrRequest{})
func trunc(b []byte, max int) ([]byte, string) {
if len(b) > max {
return b[:max], "..."
}
return b, ""
}
func (r *SetxattrRequest) String() string {
xattr, tail := trunc(r.Xattr, 16)
return fmt.Sprintf("Setxattr [%s] %q %x%s fl=%v @%#x", &r.Header, r.Name, xattr, tail, r.Flags, r.Position)
}
// Respond replies to the request, indicating that the extended attribute was set.
func (r *SetxattrRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A LookupRequest asks to look up the given name in the directory named by r.Node.
type LookupRequest struct {
Header `json:"-"`
Name string
}
var _ = Request(&LookupRequest{})
func (r *LookupRequest) String() string {
return fmt.Sprintf("Lookup [%s] %q", &r.Header, r.Name)
}
// Respond replies to the request with the given response.
func (r *LookupRequest) Respond(resp *LookupResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A LookupResponse is the response to a LookupRequest.
type LookupResponse struct {
Node NodeID
Generation uint64
EntryValid time.Duration
Attr Attr
}
func (r *LookupResponse) string() string {
return fmt.Sprintf("%v gen=%d valid=%v attr={%v}", r.Node, r.Generation, r.EntryValid, r.Attr)
}
func (r *LookupResponse) String() string {
return fmt.Sprintf("Lookup %s", r.string())
}
// An OpenRequest asks to open a file or directory.
type OpenRequest struct {
Header `json:"-"`
Dir bool // is this Opendir?
Flags OpenFlags
}
var _ = Request(&OpenRequest{})
func (r *OpenRequest) String() string {
return fmt.Sprintf("Open [%s] dir=%v fl=%v", &r.Header, r.Dir, r.Flags)
}
// Respond replies to the request with the given response.
func (r *OpenRequest) Respond(resp *OpenResponse) {
buf := newBuffer(unsafe.Sizeof(openOut{}))
out := (*openOut)(buf.alloc(unsafe.Sizeof(openOut{})))
out.Fh = uint64(resp.Handle)
out.OpenFlags = uint32(resp.Flags)
r.respond(buf)
}
// An OpenResponse is the response to an OpenRequest.
type OpenResponse struct {
Handle HandleID
Flags OpenResponseFlags
}
func (r *OpenResponse) string() string {
return fmt.Sprintf("%v fl=%v", r.Handle, r.Flags)
}
func (r *OpenResponse) String() string {
return fmt.Sprintf("Open %s", r.string())
}
// A CreateRequest asks to create and open a file (not a directory).
type CreateRequest struct {
Header `json:"-"`
Name string
Flags OpenFlags
Mode os.FileMode
// Umask of the request. Not supported on OS X.
Umask os.FileMode
}
var _ = Request(&CreateRequest{})
func (r *CreateRequest) String() string {
return fmt.Sprintf("Create [%s] %q fl=%v mode=%v umask=%v", &r.Header, r.Name, r.Flags, r.Mode, r.Umask)
}
// Respond replies to the request with the given response.
func (r *CreateRequest) Respond(resp *CreateResponse) {
eSize := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(eSize + unsafe.Sizeof(openOut{}))
e := (*entryOut)(buf.alloc(eSize))
e.Nodeid = uint64(resp.Node)
e.Generation = resp.Generation
e.EntryValid = uint64(resp.EntryValid / time.Second)
e.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
e.AttrValid = uint64(resp.Attr.Valid / time.Second)
e.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&e.Attr, r.Header.Conn.proto)
o := (*openOut)(buf.alloc(unsafe.Sizeof(openOut{})))
o.Fh = uint64(resp.Handle)
o.OpenFlags = uint32(resp.Flags)
r.respond(buf)
}
// A CreateResponse is the response to a CreateRequest.
// It describes the created node and opened handle.
type CreateResponse struct {
LookupResponse
OpenResponse
}
func (r *CreateResponse) String() string {
return fmt.Sprintf("Create {%s} {%s}", r.LookupResponse.string(), r.OpenResponse.string())
}
// A MkdirRequest asks to create (but not open) a directory.
type MkdirRequest struct {
Header `json:"-"`
Name string
Mode os.FileMode
// Umask of the request. Not supported on OS X.
Umask os.FileMode
}
var _ = Request(&MkdirRequest{})
func (r *MkdirRequest) String() string {
return fmt.Sprintf("Mkdir [%s] %q mode=%v umask=%v", &r.Header, r.Name, r.Mode, r.Umask)
}
// Respond replies to the request with the given response.
func (r *MkdirRequest) Respond(resp *MkdirResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A MkdirResponse is the response to a MkdirRequest.
type MkdirResponse struct {
LookupResponse
}
func (r *MkdirResponse) String() string {
return fmt.Sprintf("Mkdir %v", r.LookupResponse.string())
}
// A ReadRequest asks to read from an open file.
type ReadRequest struct {
Header `json:"-"`
Dir bool // is this Readdir?
Handle HandleID
Offset int64
Size int
Flags ReadFlags
LockOwner uint64
FileFlags OpenFlags
}
var _ = Request(&ReadRequest{})
func (r *ReadRequest) String() string {
return fmt.Sprintf("Read [%s] %v %d @%#x dir=%v fl=%v lock=%d ffl=%v", &r.Header, r.Handle, r.Size, r.Offset, r.Dir, r.Flags, r.LockOwner, r.FileFlags)
}
// Respond replies to the request with the given response.
func (r *ReadRequest) Respond(resp *ReadResponse) {
if r.Dir {
buf := newBuffer(uintptr(len(resp.Data)))
buf = append(buf, resp.Data...)
r.respond(buf)
} else {
r.respond(resp.Data)
PutBlockBuf(resp.Data)
}
}
// A ReadResponse is the response to a ReadRequest.
type ReadResponse struct {
Data []byte
}
func (r *ReadResponse) String() string {
return fmt.Sprintf("Read %d", len(r.Data))
}
type jsonReadResponse struct {
Len uint64
}
func (r *ReadResponse) MarshalJSON() ([]byte, error) {
j := jsonReadResponse{
Len: uint64(len(r.Data)),
}
return json.Marshal(j)
}
// A ReleaseRequest asks to release (close) an open file handle.
type ReleaseRequest struct {
Header `json:"-"`
Dir bool // is this Releasedir?
Handle HandleID
Flags OpenFlags // flags from OpenRequest
ReleaseFlags ReleaseFlags
LockOwner uint32
}
var _ = Request(&ReleaseRequest{})
func (r *ReleaseRequest) String() string {
return fmt.Sprintf("Release [%s] %v fl=%v rfl=%v owner=%#x", &r.Header, r.Handle, r.Flags, r.ReleaseFlags, r.LockOwner)
}
// Respond replies to the request, indicating that the handle has been released.
func (r *ReleaseRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A DestroyRequest is sent by the kernel when unmounting the file system.
// No more requests will be received after this one, but it should still be
// responded to.
type DestroyRequest struct {
Header `json:"-"`
}
var _ = Request(&DestroyRequest{})
func (r *DestroyRequest) String() string {
return fmt.Sprintf("Destroy [%s]", &r.Header)
}
// Respond replies to the request.
func (r *DestroyRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A ForgetRequest is sent by the kernel when it is forgetting about
// r.Node, which had been returned by r.N lookup requests.
type ForgetRequest struct {
Header `json:"-"`
N uint64
}
var _ = Request(&ForgetRequest{})
func (r *ForgetRequest) String() string {
return fmt.Sprintf("Forget [%s] %d", &r.Header, r.N)
}
// Respond replies to the request, indicating that the forgetfulness has been recorded.
func (r *ForgetRequest) Respond() {
// Don't reply to forget messages.
r.noResponse()
}
// A Dirent represents a single directory entry.
type Dirent struct {
// Inode this entry names.
Inode uint64
// Type of the entry, for example DT_File.
//
// Setting this is optional. The zero value (DT_Unknown) means
// callers will just need to do a Getattr when the type is
// needed. Providing a type can speed up operations
// significantly.
Type DirentType
// Name of the entry
Name string
}
// Type of an entry in a directory listing.
type DirentType uint32
const (
// These don't quite match os.FileMode; especially there's an
// explicit unknown, instead of zero value meaning file. They
// are also not quite syscall.DT_*; nothing says the FUSE
// protocol follows those, and even if they were, we don't
// want each fs to fiddle with syscall.
// The shift by 12 is hardcoded in the FUSE userspace
// low-level C library, so it's safe here.
DT_Unknown DirentType = 0
DT_Socket DirentType = syscall.S_IFSOCK >> 12
DT_Link DirentType = syscall.S_IFLNK >> 12
DT_File DirentType = syscall.S_IFREG >> 12
DT_Block DirentType = syscall.S_IFBLK >> 12
DT_Dir DirentType = syscall.S_IFDIR >> 12
DT_Char DirentType = syscall.S_IFCHR >> 12
DT_FIFO DirentType = syscall.S_IFIFO >> 12
)
func (t DirentType) String() string {
switch t {
case DT_Unknown:
return "unknown"
case DT_Socket:
return "socket"
case DT_Link:
return "link"
case DT_File:
return "file"
case DT_Block:
return "block"
case DT_Dir:
return "dir"
case DT_Char:
return "char"
case DT_FIFO:
return "fifo"
}
return "invalid"
}
// AppendDirent appends the encoded form of a directory entry to data
// and returns the resulting slice.
func AppendDirent(data []byte, dir Dirent) []byte {
de := dirent{
Ino: dir.Inode,
Namelen: uint32(len(dir.Name)),
Type: uint32(dir.Type),
}
de.Off = uint64(len(data) + direntSize + (len(dir.Name)+7)&^7)
data = append(data, (*[direntSize]byte)(unsafe.Pointer(&de))[:]...)
data = append(data, dir.Name...)
n := direntSize + uintptr(len(dir.Name))
if n%8 != 0 {
var pad [8]byte
data = append(data, pad[:8-n%8]...)
}
return data
}
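// Illustrative sketch (hypothetical inodes and names, not part of the
// original package): building a Readdir reply by appending encoded entries;
// AppendDirent pads each entry to an 8-byte boundary as shown above.
func exampleReaddirData() []byte {
	var data []byte
	data = AppendDirent(data, Dirent{Inode: 1, Type: DT_Dir, Name: "."})
	data = AppendDirent(data, Dirent{Inode: 1, Type: DT_Dir, Name: ".."})
	data = AppendDirent(data, Dirent{Inode: 7, Type: DT_File, Name: "hello.txt"})
	return data
}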
// A WriteRequest asks to write to an open file.
type WriteRequest struct {
Header
Handle HandleID
Offset int64
Data []byte
Flags WriteFlags
LockOwner uint64
FileFlags OpenFlags
}
var _ = Request(&WriteRequest{})
func (r *WriteRequest) String() string {
return fmt.Sprintf("Write [%s] %v %d @%d fl=%v lock=%d ffl=%v", &r.Header, r.Handle, len(r.Data), r.Offset, r.Flags, r.LockOwner, r.FileFlags)
}
type jsonWriteRequest struct {
Handle HandleID
Offset int64
Len uint64
Flags WriteFlags
}
func (r *WriteRequest) MarshalJSON() ([]byte, error) {
j := jsonWriteRequest{
Handle: r.Handle,
Offset: r.Offset,
Len: uint64(len(r.Data)),
Flags: r.Flags,
}
return json.Marshal(j)
}
// Respond replies to the request with the given response.
func (r *WriteRequest) Respond(resp *WriteResponse) {
buf := newBuffer(unsafe.Sizeof(writeOut{}))
out := (*writeOut)(buf.alloc(unsafe.Sizeof(writeOut{})))
out.Size = uint32(resp.Size)
r.respond(buf)
}
// A WriteResponse replies to a write indicating how many bytes were written.
type WriteResponse struct {
Size int
}
func (r *WriteResponse) String() string {
return fmt.Sprintf("Write %d", r.Size)
}
// A SetattrRequest asks to change one or more attributes associated with a file,
// as indicated by Valid.
type SetattrRequest struct {
Header `json:"-"`
Valid SetattrValid
Handle HandleID
Size uint64
Atime time.Time
Mtime time.Time
Mode os.FileMode
Uid uint32
Gid uint32
// OS X only
Bkuptime time.Time
Chgtime time.Time
Crtime time.Time
Flags uint32 // see chflags(2)
}
var _ = Request(&SetattrRequest{})
func (r *SetattrRequest) String() string {
var buf bytes.Buffer
fmt.Fprintf(&buf, "Setattr [%s]", &r.Header)
if r.Valid.Mode() {
fmt.Fprintf(&buf, " mode=%v", r.Mode)
}
if r.Valid.Uid() {
fmt.Fprintf(&buf, " uid=%d", r.Uid)
}
if r.Valid.Gid() {
fmt.Fprintf(&buf, " gid=%d", r.Gid)
}
if r.Valid.Size() {
fmt.Fprintf(&buf, " size=%d", r.Size)
}
if r.Valid.Atime() {
fmt.Fprintf(&buf, " atime=%v", r.Atime)
}
if r.Valid.AtimeNow() {
fmt.Fprintf(&buf, " atime=now")
}
if r.Valid.Mtime() {
fmt.Fprintf(&buf, " mtime=%v", r.Mtime)
}
if r.Valid.MtimeNow() {
fmt.Fprintf(&buf, " mtime=now")
}
if r.Valid.Handle() {
fmt.Fprintf(&buf, " handle=%v", r.Handle)
} else {
fmt.Fprintf(&buf, " handle=INVALID-%v", r.Handle)
}
if r.Valid.LockOwner() {
fmt.Fprintf(&buf, " lockowner")
}
if r.Valid.Crtime() {
fmt.Fprintf(&buf, " crtime=%v", r.Crtime)
}
if r.Valid.Chgtime() {
fmt.Fprintf(&buf, " chgtime=%v", r.Chgtime)
}
if r.Valid.Bkuptime() {
fmt.Fprintf(&buf, " bkuptime=%v", r.Bkuptime)
}
if r.Valid.Flags() {
fmt.Fprintf(&buf, " flags=%v", r.Flags)
}
return buf.String()
}
// Respond replies to the request with the given response,
// giving the updated attributes.
func (r *SetattrRequest) Respond(resp *SetattrResponse) {
size := attrOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*attrOut)(buf.alloc(size))
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A SetattrResponse is the response to a SetattrRequest.
type SetattrResponse struct {
Attr Attr // file attributes
}
func (r *SetattrResponse) String() string {
return fmt.Sprintf("Setattr %v", r.Attr)
}
// A FlushRequest asks for the current state of an open file to be flushed
// to storage, as when a file descriptor is being closed. A single opened Handle
// may receive multiple FlushRequests over its lifetime.
type FlushRequest struct {
Header `json:"-"`
Handle HandleID
Flags uint32
LockOwner uint64
}
var _ = Request(&FlushRequest{})
func (r *FlushRequest) String() string {
return fmt.Sprintf("Flush [%s] %v fl=%#x lk=%#x", &r.Header, r.Handle, r.Flags, r.LockOwner)
}
// Respond replies to the request, indicating that the flush succeeded.
func (r *FlushRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A RemoveRequest asks to remove a file or directory from the
// directory r.Node.
type RemoveRequest struct {
Header `json:"-"`
Name string // name of the entry to remove
Dir bool // is this rmdir?
}
var _ = Request(&RemoveRequest{})
func (r *RemoveRequest) String() string {
return fmt.Sprintf("Remove [%s] %q dir=%v", &r.Header, r.Name, r.Dir)
}
// Respond replies to the request, indicating that the file was removed.
func (r *RemoveRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A SymlinkRequest is a request to create a symlink making NewName point to Target.
type SymlinkRequest struct {
Header `json:"-"`
NewName, Target string
}
var _ = Request(&SymlinkRequest{})
func (r *SymlinkRequest) String() string {
return fmt.Sprintf("Symlink [%s] from %q to target %q", &r.Header, r.NewName, r.Target)
}
// Respond replies to the request, indicating that the symlink was created.
func (r *SymlinkRequest) Respond(resp *SymlinkResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A SymlinkResponse is the response to a SymlinkRequest.
type SymlinkResponse struct {
LookupResponse
}
func (r *SymlinkResponse) String() string {
return fmt.Sprintf("Symlink %v", r.LookupResponse.string())
}
// A ReadlinkRequest is a request to read a symlink's target.
type ReadlinkRequest struct {
Header `json:"-"`
}
var _ = Request(&ReadlinkRequest{})
func (r *ReadlinkRequest) String() string {
return fmt.Sprintf("Readlink [%s]", &r.Header)
}
func (r *ReadlinkRequest) Respond(target string) {
buf := newBuffer(uintptr(len(target)))
buf = append(buf, target...)
r.respond(buf)
}
// A LinkRequest is a request to create a hard link.
type LinkRequest struct {
Header `json:"-"`
OldNode NodeID
NewName string
}
var _ = Request(&LinkRequest{})
func (r *LinkRequest) String() string {
return fmt.Sprintf("Link [%s] node %d to %q", &r.Header, r.OldNode, r.NewName)
}
func (r *LinkRequest) Respond(resp *LookupResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A RenameRequest is a request to rename a file.
type RenameRequest struct {
Header `json:"-"`
NewDir NodeID
OldName, NewName string
}
var _ = Request(&RenameRequest{})
func (r *RenameRequest) String() string {
return fmt.Sprintf("Rename [%s] from %q to dirnode %v %q", &r.Header, r.OldName, r.NewDir, r.NewName)
}
func (r *RenameRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A MknodRequest asks to create a (possibly special) file, such as a
// device node or named pipe.
type MknodRequest struct {
Header `json:"-"`
Name string
Mode os.FileMode
Rdev uint32
// Umask of the request. Not supported on OS X.
Umask os.FileMode
}
var _ = Request(&MknodRequest{})
func (r *MknodRequest) String() string {
return fmt.Sprintf("Mknod [%s] Name %q mode=%v umask=%v rdev=%d", &r.Header, r.Name, r.Mode, r.Umask, r.Rdev)
}
func (r *MknodRequest) Respond(resp *LookupResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// An FsyncRequest asks to flush an open file's (or directory's) contents
// to stable storage.
type FsyncRequest struct {
Header `json:"-"`
Handle HandleID
// TODO bit 1 is datasync, not well documented upstream
Flags uint32
Dir bool
}
var _ = Request(&FsyncRequest{})
func (r *FsyncRequest) String() string {
return fmt.Sprintf("Fsync [%s] Handle %v Flags %v", &r.Header, r.Handle, r.Flags)
}
func (r *FsyncRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// An InterruptRequest is a request to interrupt another pending request. The
// response to that request should return an error status of EINTR.
type InterruptRequest struct {
Header `json:"-"`
IntrID RequestID // ID of the request to be interrupted.
}
var _ = Request(&InterruptRequest{})
func (r *InterruptRequest) Respond() {
// nothing to do here
r.noResponse()
}
func (r *InterruptRequest) String() string {
return fmt.Sprintf("Interrupt [%s] ID %v", &r.Header, r.IntrID)
}
// An ExchangeDataRequest is a request to exchange the contents of two
// files, while leaving most metadata untouched.
//
// This request comes from OS X exchangedata(2) and represents its
// specific semantics. Crucially, it is very different from Linux
// renameat(2) RENAME_EXCHANGE.
//
// https://developer.apple.com/library/mac/documentation/Darwin/Reference/ManPages/man2/exchangedata.2.html
type ExchangeDataRequest struct {
Header `json:"-"`
OldDir, NewDir NodeID
OldName, NewName string
// TODO options
}
var _ = Request(&ExchangeDataRequest{})
func (r *ExchangeDataRequest) String() string {
// TODO options
return fmt.Sprintf("ExchangeData [%s] %v %q and %v %q", &r.Header, r.OldDir, r.OldName, r.NewDir, r.NewName)
}
func (r *ExchangeDataRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// See the file LICENSE for copyright and licensing information.
// Derived from FUSE's fuse_kernel.h, which carries this notice:
/*
This file defines the kernel interface of FUSE
Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
This -- and only this -- header file may also be distributed under
the terms of the BSD Licence as follows:
Copyright (C) 2001-2007 Miklos Szeredi. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
package fuse
import (
"fmt"
"syscall"
"unsafe"
)
// The FUSE version implemented by the package.
const (
protoVersionMinMajor = 7
protoVersionMinMinor = 8
protoVersionMaxMajor = 7
protoVersionMaxMinor = 12
)
const (
rootID = 1
)
type kstatfs struct {
Blocks uint64
Bfree uint64
Bavail uint64
Files uint64
Ffree uint64
Bsize uint32
Namelen uint32
Frsize uint32
_ uint32
Spare [6]uint32
}
type fileLock struct {
Start uint64
End uint64
Type uint32
Pid uint32
}
// GetattrFlags are bit flags that can be seen in GetattrRequest.
type GetattrFlags uint32
const (
// Indicates the handle is valid.
GetattrFh GetattrFlags = 1 << 0
)
var getattrFlagsNames = []flagName{
{uint32(GetattrFh), "GetattrFh"},
}
func (fl GetattrFlags) String() string {
return flagString(uint32(fl), getattrFlagsNames)
}
// The SetattrValid are bit flags describing which fields in the SetattrRequest
// are included in the change.
type SetattrValid uint32
const (
SetattrMode SetattrValid = 1 << 0
SetattrUid SetattrValid = 1 << 1
SetattrGid SetattrValid = 1 << 2
SetattrSize SetattrValid = 1 << 3
SetattrAtime SetattrValid = 1 << 4
SetattrMtime SetattrValid = 1 << 5
SetattrHandle SetattrValid = 1 << 6
// Linux only(?)
SetattrAtimeNow SetattrValid = 1 << 7
SetattrMtimeNow SetattrValid = 1 << 8
SetattrLockOwner SetattrValid = 1 << 9 // http://www.mail-archive.com/git-commits-head@vger.kernel.org/msg27852.html
// OS X only
SetattrCrtime SetattrValid = 1 << 28
SetattrChgtime SetattrValid = 1 << 29
SetattrBkuptime SetattrValid = 1 << 30
SetattrFlags SetattrValid = 1 << 31
)
func (fl SetattrValid) Mode() bool { return fl&SetattrMode != 0 }
func (fl SetattrValid) Uid() bool { return fl&SetattrUid != 0 }
func (fl SetattrValid) Gid() bool { return fl&SetattrGid != 0 }
func (fl SetattrValid) Size() bool { return fl&SetattrSize != 0 }
func (fl SetattrValid) Atime() bool { return fl&SetattrAtime != 0 }
func (fl SetattrValid) Mtime() bool { return fl&SetattrMtime != 0 }
func (fl SetattrValid) Handle() bool { return fl&SetattrHandle != 0 }
func (fl SetattrValid) AtimeNow() bool { return fl&SetattrAtimeNow != 0 }
func (fl SetattrValid) MtimeNow() bool { return fl&SetattrMtimeNow != 0 }
func (fl SetattrValid) LockOwner() bool { return fl&SetattrLockOwner != 0 }
func (fl SetattrValid) Crtime() bool { return fl&SetattrCrtime != 0 }
func (fl SetattrValid) Chgtime() bool { return fl&SetattrChgtime != 0 }
func (fl SetattrValid) Bkuptime() bool { return fl&SetattrBkuptime != 0 }
func (fl SetattrValid) Flags() bool { return fl&SetattrFlags != 0 }
func (fl SetattrValid) String() string {
return flagString(uint32(fl), setattrValidNames)
}
var setattrValidNames = []flagName{
{uint32(SetattrMode), "SetattrMode"},
{uint32(SetattrUid), "SetattrUid"},
{uint32(SetattrGid), "SetattrGid"},
{uint32(SetattrSize), "SetattrSize"},
{uint32(SetattrAtime), "SetattrAtime"},
{uint32(SetattrMtime), "SetattrMtime"},
{uint32(SetattrHandle), "SetattrHandle"},
{uint32(SetattrAtimeNow), "SetattrAtimeNow"},
{uint32(SetattrMtimeNow), "SetattrMtimeNow"},
{uint32(SetattrLockOwner), "SetattrLockOwner"},
{uint32(SetattrCrtime), "SetattrCrtime"},
{uint32(SetattrChgtime), "SetattrChgtime"},
{uint32(SetattrBkuptime), "SetattrBkuptime"},
{uint32(SetattrFlags), "SetattrFlags"},
}
// Flags that can be seen in OpenRequest.Flags.
const (
// Access modes. These are not 1-bit flags, but alternatives where
// only one can be chosen. See the IsReadOnly etc convenience
// methods.
OpenReadOnly OpenFlags = syscall.O_RDONLY
OpenWriteOnly OpenFlags = syscall.O_WRONLY
OpenReadWrite OpenFlags = syscall.O_RDWR
// File was opened in append-only mode; all writes will go to the
// end of the file. OS X does not provide this information.
OpenAppend OpenFlags = syscall.O_APPEND
OpenCreate OpenFlags = syscall.O_CREAT
OpenDirectory OpenFlags = syscall.O_DIRECTORY
OpenExclusive OpenFlags = syscall.O_EXCL
OpenNonblock OpenFlags = syscall.O_NONBLOCK
OpenSync OpenFlags = syscall.O_SYNC
OpenTruncate OpenFlags = syscall.O_TRUNC
)
// OpenAccessModeMask is a bitmask that separates the access mode
// from the other flags in OpenFlags.
const OpenAccessModeMask OpenFlags = syscall.O_ACCMODE
// OpenFlags are the O_FOO flags passed to open/create/etc calls. For
// example, os.O_WRONLY | os.O_APPEND.
type OpenFlags uint32
func (fl OpenFlags) String() string {
// O_RDONLY, O_WRONLY, O_RDWR are not flags
s := accModeName(fl & OpenAccessModeMask)
flags := uint32(fl &^ OpenAccessModeMask)
if flags != 0 {
s = s + "+" + flagString(flags, openFlagNames)
}
return s
}
// Return true if OpenReadOnly is set.
func (fl OpenFlags) IsReadOnly() bool {
return fl&OpenAccessModeMask == OpenReadOnly
}
// Return true if OpenWriteOnly is set.
func (fl OpenFlags) IsWriteOnly() bool {
return fl&OpenAccessModeMask == OpenWriteOnly
}
// Return true if OpenReadWrite is set.
func (fl OpenFlags) IsReadWrite() bool {
return fl&OpenAccessModeMask == OpenReadWrite
}
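// Illustrative sketch (hypothetical helper, not part of the original
// package): classifying the access mode of an OpenRequest's flags with the
// convenience methods above.
func exampleAccessMode(fl OpenFlags) string {
	switch {
	case fl.IsReadOnly():
		return "read-only"
	case fl.IsWriteOnly():
		return "write-only"
	case fl.IsReadWrite():
		return "read-write"
	default:
		return "unknown"
	}
}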
func accModeName(flags OpenFlags) string {
switch flags {
case OpenReadOnly:
return "OpenReadOnly"
case OpenWriteOnly:
return "OpenWriteOnly"
case OpenReadWrite:
return "OpenReadWrite"
default:
return ""
}
}
var openFlagNames = []flagName{
{uint32(OpenAppend), "OpenAppend"},
{uint32(OpenCreate), "OpenCreate"},
{uint32(OpenDirectory), "OpenDirectory"},
{uint32(OpenExclusive), "OpenExclusive"},
{uint32(OpenNonblock), "OpenNonblock"},
{uint32(OpenSync), "OpenSync"},
{uint32(OpenTruncate), "OpenTruncate"},
}
// The OpenResponseFlags are returned in the OpenResponse.
type OpenResponseFlags uint32
const (
OpenDirectIO OpenResponseFlags = 1 << 0 // bypass page cache for this open file
OpenKeepCache OpenResponseFlags = 1 << 1 // don't invalidate the data cache on open
OpenNonSeekable OpenResponseFlags = 1 << 2 // mark the file as non-seekable (not supported on OS X)
OpenPurgeAttr OpenResponseFlags = 1 << 30 // OS X
OpenPurgeUBC OpenResponseFlags = 1 << 31 // OS X
)
func (fl OpenResponseFlags) String() string {
return flagString(uint32(fl), openResponseFlagNames)
}
var openResponseFlagNames = []flagName{
{uint32(OpenDirectIO), "OpenDirectIO"},
{uint32(OpenKeepCache), "OpenKeepCache"},
{uint32(OpenNonSeekable), "OpenNonSeekable"},
{uint32(OpenPurgeAttr), "OpenPurgeAttr"},
{uint32(OpenPurgeUBC), "OpenPurgeUBC"},
}
// The InitFlags are used in the Init exchange.
type InitFlags uint32
const (
InitAsyncRead InitFlags = 1 << 0
InitPosixLocks InitFlags = 1 << 1
InitFileOps InitFlags = 1 << 2
InitAtomicTrunc InitFlags = 1 << 3
InitExportSupport InitFlags = 1 << 4
InitBigWrites InitFlags = 1 << 5
// Do not mask file access modes with umask. Not supported on OS X.
InitDontMask InitFlags = 1 << 6
InitSpliceWrite InitFlags = 1 << 7
InitSpliceMove InitFlags = 1 << 8
InitSpliceRead InitFlags = 1 << 9
InitFlockLocks InitFlags = 1 << 10
InitHasIoctlDir InitFlags = 1 << 11
InitAutoInvalData InitFlags = 1 << 12
InitDoReaddirplus InitFlags = 1 << 13
InitReaddirplusAuto InitFlags = 1 << 14
InitAsyncDIO InitFlags = 1 << 15
InitWritebackCache InitFlags = 1 << 16
InitNoOpenSupport InitFlags = 1 << 17
InitPOSIXACL InitFlags = 1 << 20
InitCaseSensitive InitFlags = 1 << 29 // OS X only
InitVolRename InitFlags = 1 << 30 // OS X only
InitXtimes InitFlags = 1 << 31 // OS X only
)
type flagName struct {
bit uint32
name string
}
var initFlagNames = []flagName{
{uint32(InitAsyncRead), "InitAsyncRead"},
{uint32(InitPosixLocks), "InitPosixLocks"},
{uint32(InitFileOps), "InitFileOps"},
{uint32(InitAtomicTrunc), "InitAtomicTrunc"},
{uint32(InitExportSupport), "InitExportSupport"},
{uint32(InitBigWrites), "InitBigWrites"},
{uint32(InitDontMask), "InitDontMask"},
{uint32(InitSpliceWrite), "InitSpliceWrite"},
{uint32(InitSpliceMove), "InitSpliceMove"},
{uint32(InitSpliceRead), "InitSpliceRead"},
{uint32(InitFlockLocks), "InitFlockLocks"},
{uint32(InitHasIoctlDir), "InitHasIoctlDir"},
{uint32(InitAutoInvalData), "InitAutoInvalData"},
{uint32(InitDoReaddirplus), "InitDoReaddirplus"},
{uint32(InitReaddirplusAuto), "InitReaddirplusAuto"},
{uint32(InitAsyncDIO), "InitAsyncDIO"},
{uint32(InitWritebackCache), "InitWritebackCache"},
{uint32(InitNoOpenSupport), "InitNoOpenSupport"},
{uint32(InitPOSIXACL), "InitPOSIXACL"},
{uint32(InitCaseSensitive), "InitCaseSensitive"},
{uint32(InitVolRename), "InitVolRename"},
{uint32(InitXtimes), "InitXtimes"},
}
func (fl InitFlags) String() string {
return flagString(uint32(fl), initFlagNames)
}
func flagString(f uint32, names []flagName) string {
var s string
if f == 0 {
return "0"
}
for _, n := range names {
if f&n.bit != 0 {
s += "+" + n.name
f &^= n.bit
}
}
if f != 0 {
s += fmt.Sprintf("%+#x", f)
}
return s[1:]
}
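// Illustrative sketch (not part of the original package): flagString joins
// the names of the set bits with '+' and appends any leftover unnamed bits
// in hex, so a combined init flag value renders as, for example,
// "InitAsyncRead+InitBigWrites".
func exampleFlagString() string {
	return flagString(uint32(InitAsyncRead|InitBigWrites), initFlagNames)
}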
// The ReleaseFlags are used in the Release exchange.
type ReleaseFlags uint32
const (
ReleaseFlush ReleaseFlags = 1 << 0
)
func (fl ReleaseFlags) String() string {
return flagString(uint32(fl), releaseFlagNames)
}
var releaseFlagNames = []flagName{
{uint32(ReleaseFlush), "ReleaseFlush"},
}
// Opcodes
const (
opLookup = 1
opForget = 2 // no reply
opGetattr = 3
opSetattr = 4
opReadlink = 5
opSymlink = 6
opMknod = 8
opMkdir = 9
opUnlink = 10
opRmdir = 11
opRename = 12
opLink = 13
opOpen = 14
opRead = 15
opWrite = 16
opStatfs = 17
opRelease = 18
opFsync = 20
opSetxattr = 21
opGetxattr = 22
opListxattr = 23
opRemovexattr = 24
opFlush = 25
opInit = 26
opOpendir = 27
opReaddir = 28
opReleasedir = 29
opFsyncdir = 30
opGetlk = 31
opSetlk = 32
opSetlkw = 33
opAccess = 34
opCreate = 35
opInterrupt = 36
opBmap = 37
opDestroy = 38
opIoctl = 39 // Linux?
opPoll = 40 // Linux?
// OS X
opSetvolname = 61
opGetxtimes = 62
opExchange = 63
)
type entryOut struct {
Nodeid uint64 // Inode ID
Generation uint64 // Inode generation
EntryValid uint64 // Cache timeout for the name
AttrValid uint64 // Cache timeout for the attributes
EntryValidNsec uint32
AttrValidNsec uint32
Attr attr
}
func entryOutSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(entryOut{}.Attr) + unsafe.Offsetof(entryOut{}.Attr.Blksize)
default:
return unsafe.Sizeof(entryOut{})
}
}
type forgetIn struct {
Nlookup uint64
}
type getattrIn struct {
GetattrFlags uint32
_ uint32
Fh uint64
}
type attrOut struct {
AttrValid uint64 // Cache timeout for the attributes
AttrValidNsec uint32
_ uint32
Attr attr
}
func attrOutSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(attrOut{}.Attr) + unsafe.Offsetof(attrOut{}.Attr.Blksize)
default:
return unsafe.Sizeof(attrOut{})
}
}
// OS X
type getxtimesOut struct {
Bkuptime uint64
Crtime uint64
BkuptimeNsec uint32
CrtimeNsec uint32
}
type mknodIn struct {
Mode uint32
Rdev uint32
Umask uint32
_ uint32
// "filename\x00" follows.
}
func mknodInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 12}):
return unsafe.Offsetof(mknodIn{}.Umask)
default:
return unsafe.Sizeof(mknodIn{})
}
}
type mkdirIn struct {
Mode uint32
Umask uint32
// filename follows
}
func mkdirInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 12}):
return unsafe.Offsetof(mkdirIn{}.Umask) + 4
default:
return unsafe.Sizeof(mkdirIn{})
}
}
type renameIn struct {
Newdir uint64
// "oldname\x00newname\x00" follows
}
// OS X
type exchangeIn struct {
Olddir uint64
Newdir uint64
Options uint64
// "oldname\x00newname\x00" follows
}
type linkIn struct {
Oldnodeid uint64
}
type setattrInCommon struct {
Valid uint32
_ uint32
Fh uint64
Size uint64
LockOwner uint64 // unused on OS X?
Atime uint64
Mtime uint64
Unused2 uint64
AtimeNsec uint32
MtimeNsec uint32
Unused3 uint32
Mode uint32
Unused4 uint32
Uid uint32
Gid uint32
Unused5 uint32
}
type openIn struct {
Flags uint32
Unused uint32
}
type openOut struct {
Fh uint64
OpenFlags uint32
_ uint32
}
type createIn struct {
Flags uint32
Mode uint32
Umask uint32
_ uint32
}
func createInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 12}):
return unsafe.Offsetof(createIn{}.Umask)
default:
return unsafe.Sizeof(createIn{})
}
}
type releaseIn struct {
Fh uint64
Flags uint32
ReleaseFlags uint32
LockOwner uint32
}
type flushIn struct {
Fh uint64
FlushFlags uint32
_ uint32
LockOwner uint64
}
type readIn struct {
Fh uint64
Offset uint64
Size uint32
ReadFlags uint32
LockOwner uint64
Flags uint32
_ uint32
}
func readInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(readIn{}.ReadFlags) + 4
default:
return unsafe.Sizeof(readIn{})
}
}
// The ReadFlags are passed in ReadRequest.
type ReadFlags uint32
const (
// LockOwner field is valid.
ReadLockOwner ReadFlags = 1 << 1
)
var readFlagNames = []flagName{
{uint32(ReadLockOwner), "ReadLockOwner"},
}
func (fl ReadFlags) String() string {
return flagString(uint32(fl), readFlagNames)
}
type writeIn struct {
Fh uint64
Offset uint64
Size uint32
WriteFlags uint32
LockOwner uint64
Flags uint32
_ uint32
}
func writeInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(writeIn{}.LockOwner)
default:
return unsafe.Sizeof(writeIn{})
}
}
type writeOut struct {
Size uint32
_ uint32
}
// The WriteFlags are passed in WriteRequest.
type WriteFlags uint32
const (
WriteCache WriteFlags = 1 << 0
// LockOwner field is valid.
WriteLockOwner WriteFlags = 1 << 1
)
var writeFlagNames = []flagName{
{uint32(WriteCache), "WriteCache"},
{uint32(WriteLockOwner), "WriteLockOwner"},
}
func (fl WriteFlags) String() string {
return flagString(uint32(fl), writeFlagNames)
}
const compatStatfsSize = 48
type statfsOut struct {
St kstatfs
}
type fsyncIn struct {
Fh uint64
FsyncFlags uint32
_ uint32
}
type setxattrInCommon struct {
Size uint32
Flags uint32
}
func (setxattrInCommon) position() uint32 {
return 0
}
type getxattrInCommon struct {
Size uint32
_ uint32
}
func (getxattrInCommon) position() uint32 {
return 0
}
type getxattrOut struct {
Size uint32
_ uint32
}
type lkIn struct {
Fh uint64
Owner uint64
Lk fileLock
LkFlags uint32
_ uint32
}
func lkInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(lkIn{}.LkFlags)
default:
return unsafe.Sizeof(lkIn{})
}
}
type lkOut struct {
Lk fileLock
}
type accessIn struct {
Mask uint32
_ uint32
}
type initIn struct {
Major uint32
Minor uint32
MaxReadahead uint32
Flags uint32
}
const initInSize = int(unsafe.Sizeof(initIn{}))
type initOut struct {
Major uint32
Minor uint32
MaxReadahead uint32
Flags uint32
Unused uint32
MaxWrite uint32
}
type interruptIn struct {
Unique uint64
}
type bmapIn struct {
Block uint64
BlockSize uint32
_ uint32
}
type bmapOut struct {
Block uint64
}
type inHeader struct {
Len uint32
Opcode uint32
Unique uint64
Nodeid uint64
Uid uint32
Gid uint32
Pid uint32
_ uint32
}
const inHeaderSize = int(unsafe.Sizeof(inHeader{}))
type outHeader struct {
Len uint32
Error int32
Unique uint64
}
const OutHeaderSize = int(unsafe.Sizeof(outHeader{}))
type dirent struct {
Ino uint64
Off uint64
Namelen uint32
Type uint32
Name [0]byte
}
const direntSize = 8 + 8 + 4 + 4
const (
notifyCodePoll int32 = 1
notifyCodeInvalInode int32 = 2
notifyCodeInvalEntry int32 = 3
)
type notifyInvalInodeOut struct {
Ino uint64
Off int64
Len int64
}
type notifyInvalEntryOut struct {
Parent uint64
Namelen uint32
_ uint32
}
package fuse
import "time"
type attr struct {
Ino uint64
Size uint64
Blocks uint64
Atime uint64
Mtime uint64
Ctime uint64
AtimeNsec uint32
MtimeNsec uint32
CtimeNsec uint32
Mode uint32
Nlink uint32
Uid uint32
Gid uint32
Rdev uint32
Blksize uint32
padding uint32
}
func (a *attr) Crtime() time.Time {
return time.Time{}
}
func (a *attr) SetCrtime(s uint64, ns uint32) {
// Ignored on Linux.
}
func (a *attr) SetFlags(f uint32) {
// Ignored on Linux.
}
type setattrIn struct {
setattrInCommon
}
func (in *setattrIn) BkupTime() time.Time {
return time.Time{}
}
func (in *setattrIn) Chgtime() time.Time {
return time.Time{}
}
func (in *setattrIn) Flags() uint32 {
return 0
}
func openFlags(flags uint32) OpenFlags {
// on amd64, the 32-bit O_LARGEFILE flag is always seen;
// on i386, the flag probably depends on the app
// requesting, but in any case should be utterly
// uninteresting to us here; our kernel protocol messages
// are not directly related to the client app's kernel
// API/ABI
flags &^= 0x8000
return OpenFlags(flags)
}
type getxattrIn struct {
getxattrInCommon
}
type setxattrIn struct {
setxattrInCommon
}
package fuseutil // import "github.com/cubefs/cubefs/depends/bazil.org/fuse/fuseutil"
import (
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
)
// HandleRead handles a read request assuming that data is the entire file content.
// It adjusts the amount returned in resp according to req.Offset and req.Size.
func HandleRead(req *fuse.ReadRequest, resp *fuse.ReadResponse, data []byte) {
if req.Offset >= int64(len(data)) {
data = nil
} else {
data = data[req.Offset:]
}
if len(data) > req.Size {
data = data[:req.Size]
}
n := copy(resp.Data[:req.Size], data)
resp.Data = resp.Data[:n]
}
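// Illustrative sketch (hypothetical handler, not part of the original
// package): serving a ReadRequest from an in-memory copy of the file.
// resp.Data is assumed to have been allocated with capacity for req.Size
// bytes before the request is dispatched.
func exampleServeRead(req *fuse.ReadRequest, resp *fuse.ReadResponse, content []byte) {
	// HandleRead slices content according to req.Offset and req.Size and
	// copies the result into resp.Data.
	HandleRead(req, resp, content)
}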
package fuse
import (
"bufio"
"errors"
"io"
"log"
"sync"
)
var (
// ErrOSXFUSENotFound is returned from Mount when the OSXFUSE
// installation is not detected.
//
// Only happens on OS X. Make sure OSXFUSE is installed, or see
// OSXFUSELocations for customization.
ErrOSXFUSENotFound = errors.New("cannot locate OSXFUSE")
)
func neverIgnoreLine(line string) bool {
return false
}
func lineLogger(wg *sync.WaitGroup, prefix string, ignore func(line string) bool, r io.ReadCloser) {
defer wg.Done()
scanner := bufio.NewScanner(r)
for scanner.Scan() {
line := scanner.Text()
if ignore(line) {
continue
}
log.Printf("%s: %s", prefix, line)
}
if err := scanner.Err(); err != nil {
log.Printf("%s, error reading: %v", prefix, err)
}
}
package fuse
import (
"fmt"
"log"
"net"
"os"
"os/exec"
"strings"
"sync"
"syscall"
)
func handleFusermountStderr(errCh chan<- error) func(line string) (ignore bool) {
return func(line string) (ignore bool) {
if line == `fusermount: failed to open /etc/fuse.conf: Permission denied` {
// Silence this particular message, it occurs way too
// commonly and isn't very relevant to whether the mount
// succeeds or not.
return true
}
const (
noMountpointPrefix = `fusermount: failed to access mountpoint `
noMountpointSuffix = `: No such file or directory`
)
if strings.HasPrefix(line, noMountpointPrefix) && strings.HasSuffix(line, noMountpointSuffix) {
// re-extract it from the error message in case some layer
// changed the path
mountpoint := line[len(noMountpointPrefix) : len(line)-len(noMountpointSuffix)]
err := &MountpointDoesNotExistError{
Path: mountpoint,
}
select {
case errCh <- err:
return true
default:
// not the first error; fall back to logging it
return false
}
}
return false
}
}
// isBoringFusermountError returns whether the Wait error is
// uninteresting; exit status 1 is.
func isBoringFusermountError(err error) bool {
if err, ok := err.(*exec.ExitError); ok && err.Exited() {
if status, ok := err.Sys().(syscall.WaitStatus); ok && status.ExitStatus() == 1 {
return true
}
}
return false
}
func mount(dir string, conf *mountConfig, ready chan<- struct{}, errp *error) (fusefd *os.File, err error) {
// linux mount is never delayed
close(ready)
fds, err := syscall.Socketpair(syscall.AF_FILE, syscall.SOCK_STREAM, 0)
if err != nil {
return nil, fmt.Errorf("socketpair error: %v", err)
}
writeFile := os.NewFile(uintptr(fds[0]), "fusermount-child-writes")
defer writeFile.Close()
readFile := os.NewFile(uintptr(fds[1]), "fusermount-parent-reads")
defer readFile.Close()
cmd := exec.Command(
"fusermount",
"-o", conf.getOptions(),
"--",
dir,
)
cmd.Env = append(os.Environ(), "_FUSE_COMMFD=3")
cmd.ExtraFiles = []*os.File{writeFile}
var wg sync.WaitGroup
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, fmt.Errorf("setting up fusermount stderr: %v", err)
}
stderr, err := cmd.StderrPipe()
if err != nil {
return nil, fmt.Errorf("setting up fusermount stderr: %v", err)
}
if err := cmd.Start(); err != nil {
return nil, fmt.Errorf("fusermount: %v", err)
}
helperErrCh := make(chan error, 1)
wg.Add(2)
go lineLogger(&wg, "mount helper output", neverIgnoreLine, stdout)
go lineLogger(&wg, "mount helper error", handleFusermountStderr(helperErrCh), stderr)
wg.Wait()
if err := cmd.Wait(); err != nil {
// see if we have a better error to report
select {
case helperErr := <-helperErrCh:
// log the Wait error if it's not what we expected
if !isBoringFusermountError(err) {
log.Printf("mount helper failed: %v", err)
}
// and now return what we grabbed from stderr as the real
// error
return nil, helperErr
default:
// nope, fall back to generic message
}
return nil, fmt.Errorf("fusermount: %v", err)
}
c, err := net.FileConn(readFile)
if err != nil {
return nil, fmt.Errorf("FileConn from fusermount socket: %v", err)
}
defer c.Close()
uc, ok := c.(*net.UnixConn)
if !ok {
return nil, fmt.Errorf("unexpected FileConn type; expected UnixConn, got %T", c)
}
buf := make([]byte, 32) // expect 1 byte
oob := make([]byte, 32) // expect 24 bytes
_, oobn, _, _, err := uc.ReadMsgUnix(buf, oob)
if err != nil {
return nil, fmt.Errorf("ReadMsgUnix: %v", err)
}
scms, err := syscall.ParseSocketControlMessage(oob[:oobn])
if err != nil {
return nil, fmt.Errorf("ParseSocketControlMessage: %v", err)
}
if len(scms) != 1 {
return nil, fmt.Errorf("expected 1 SocketControlMessage; got scms = %#v", scms)
}
scm := scms[0]
gotFds, err := syscall.ParseUnixRights(&scm)
if err != nil {
return nil, fmt.Errorf("syscall.ParseUnixRights: %v", err)
}
if len(gotFds) != 1 {
return nil, fmt.Errorf("wanted 1 fd; got %#v", gotFds)
}
f := os.NewFile(uintptr(gotFds[0]), "/dev/fuse")
return f, nil
}
package fuse
import (
"errors"
"strings"
)
func dummyOption(conf *mountConfig) error {
return nil
}
// mountConfig holds the configuration for a mount operation.
// Use it by passing MountOption values to Mount.
type mountConfig struct {
options map[string]string
maxReadahead uint32
initFlags InitFlags
osxfuseLocations []OSXFUSEPaths
RequestTimeout int64
}
func escapeComma(s string) string {
s = strings.Replace(s, `\`, `\\`, -1)
s = strings.Replace(s, `,`, `\,`, -1)
return s
}
// getOptions makes a string of options suitable for passing to FUSE
// mount flag `-o`. Returns an empty string if no options were set.
// Any platform specific adjustments should happen before the call.
func (m *mountConfig) getOptions() string {
var opts []string
for k, v := range m.options {
k = escapeComma(k)
if v != "" {
k += "=" + escapeComma(v)
}
opts = append(opts, k)
}
return strings.Join(opts, ",")
}
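// Illustrative sketch (hypothetical option values, not part of the original
// package): how the options map is rendered for fusermount's -o flag. Keys
// and values are escaped by escapeComma, and map iteration order leaves the
// ordering of the joined string unspecified.
func exampleGetOptions() string {
	conf := &mountConfig{options: map[string]string{
		"fsname":  "cubefs",
		"subtype": "cubefs",
		"ro":      "",
	}}
	// Yields something like "fsname=cubefs,ro,subtype=cubefs".
	return conf.getOptions()
}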
type mountOption func(*mountConfig) error
// MountOption is passed to Mount to change the behavior of the mount.
type MountOption mountOption
// FSName sets the file system name (also called source) that is
// visible in the list of mounted file systems.
//
// FreeBSD ignores this option.
func FSName(name string) MountOption {
return func(conf *mountConfig) error {
conf.options["fsname"] = name
return nil
}
}
// Subtype sets the subtype of the mount. The main type is always
// `fuse`. The type in a list of mounted file systems will look like
// `fuse.foo`.
//
// OS X ignores this option.
// FreeBSD ignores this option.
func Subtype(fstype string) MountOption {
return func(conf *mountConfig) error {
conf.options["subtype"] = fstype
return nil
}
}
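// Illustrative sketch (hypothetical values, not part of the original
// package): MountOption values are typically collected into a slice and
// passed to the package's mount entry point. Only the option construction
// is shown here; the actual mount call is outside this sketch.
func exampleMountOptions() []MountOption {
	return []MountOption{
		FSName("cubefs"),
		Subtype("cubefs"),
		MaxReadahead(128 * 1024),
		AllowOther(),
	}
}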
// LocalVolume sets the volume to be local (instead of network),
// changing the behavior of Finder, Spotlight, and such.
//
// OS X only. Others ignore this option.
func LocalVolume() MountOption {
return localVolume
}
// VolumeName sets the volume name shown in Finder.
//
// OS X only. Others ignore this option.
func VolumeName(name string) MountOption {
return volumeName(name)
}
// NoAppleDouble makes OSXFUSE disallow files with names used by OS X
// to store extended attributes on file systems that do not support
// them natively.
//
// Such file names are:
//
// ._*
// .DS_Store
//
// OS X only. Others ignore this option.
func NoAppleDouble() MountOption {
return noAppleDouble
}
// NoAppleXattr makes OSXFUSE disallow extended attributes with the
// prefix "com.apple.". This disables persistent Finder state and
// other such information.
//
// OS X only. Others ignore this option.
func NoAppleXattr() MountOption {
return noAppleXattr
}
// ExclCreate causes O_EXCL flag to be set for only "truly" exclusive creates,
// i.e. create calls for which the initiator explicitly set the O_EXCL flag.
//
// OSXFUSE expects all create calls to return EEXIST in case the file
// already exists, regardless of whether O_EXCL was specified or not.
// To ensure this behavior, it normally sets OpenExclusive for all
// Create calls, regardless of whether the original call had it set.
// For distributed filesystems, that may force every file create to be
// a distributed consensus action, causing undesirable delays.
//
// This option makes the FUSE filesystem see the original flag value,
// and better decide when to ensure global consensus.
//
// Note that returning EEXIST on existing file create is still
// expected with OSXFUSE, regardless of the presence of the
// OpenExclusive flag.
//
// For more information, see
// https://github.com/osxfuse/osxfuse/issues/209
//
// OS X only. Others ignore this option.
// Requires OSXFUSE 3.4.1 or newer.
func ExclCreate() MountOption {
return exclCreate
}
// DaemonTimeout sets the time in seconds between a request and a reply before
// the FUSE mount is declared dead.
//
// OS X and FreeBSD only. Others ignore this option.
func DaemonTimeout(name string) MountOption {
return daemonTimeout(name)
}
var ErrCannotCombineAllowOtherAndAllowRoot = errors.New("cannot combine AllowOther and AllowRoot")
// AllowOther allows other users to access the file system.
//
// Only one of AllowOther or AllowRoot can be used.
func AllowOther() MountOption {
return func(conf *mountConfig) error {
if _, ok := conf.options["allow_root"]; ok {
return ErrCannotCombineAllowOtherAndAllowRoot
}
conf.options["allow_other"] = ""
return nil
}
}
// AllowRoot allows other users to access the file system.
//
// Only one of AllowOther or AllowRoot can be used.
//
// FreeBSD ignores this option.
func AllowRoot() MountOption {
return func(conf *mountConfig) error {
if _, ok := conf.options["allow_other"]; ok {
return ErrCannotCombineAllowOtherAndAllowRoot
}
conf.options["allow_root"] = ""
return nil
}
}
// AllowDev enables interpreting character or block special devices on the
// filesystem.
func AllowDev() MountOption {
return func(conf *mountConfig) error {
conf.options["dev"] = ""
return nil
}
}
// AllowSUID allows set-user-identifier or set-group-identifier bits to take
// effect.
func AllowSUID() MountOption {
return func(conf *mountConfig) error {
conf.options["suid"] = ""
return nil
}
}
// DefaultPermissions makes the kernel enforce access control based on
// the file mode (as in chmod).
//
// Without this option, the Node itself decides what is and is not
// allowed. This is normally ok because FUSE file systems cannot be
// accessed by other users without AllowOther/AllowRoot.
//
// FreeBSD ignores this option.
func DefaultPermissions() MountOption {
return func(conf *mountConfig) error {
conf.options["default_permissions"] = ""
return nil
}
}
// ReadOnly makes the mount read-only.
func ReadOnly() MountOption {
return func(conf *mountConfig) error {
conf.options["ro"] = ""
return nil
}
}
// MaxReadahead sets the number of bytes that can be prefetched for
// sequential reads. The kernel can enforce a maximum value lower than
// this.
//
// This setting makes the kernel perform speculative reads that do not
// originate from any client process. This usually tremendously
// improves read performance.
func MaxReadahead(n uint32) MountOption {
return func(conf *mountConfig) error {
conf.maxReadahead = n
return nil
}
}
// AsyncRead enables multiple outstanding read requests for the same
// handle. Without this, there is at most one request in flight at a
// time.
func AsyncRead() MountOption {
return func(conf *mountConfig) error {
conf.initFlags |= InitAsyncRead
return nil
}
}
// WritebackCache enables the kernel to buffer writes before sending
// them to the FUSE server. Without this, writethrough caching is
// used.
func WritebackCache() MountOption {
return func(conf *mountConfig) error {
conf.initFlags |= InitWritebackCache
return nil
}
}
// AutoInvalData enables automatic kernel cache invalidation (InitAutoInvalData)
// when enable is greater than zero; otherwise the returned option is a no-op.
func AutoInvalData(enable int64) MountOption {
if enable > 0 {
return func(conf *mountConfig) error {
conf.initFlags |= InitAutoInvalData
return nil
}
}
return func(conf *mountConfig) error {
return nil
}
}
// OSXFUSEPaths describes the paths used by an installed OSXFUSE
// version. See OSXFUSELocationV3 for typical values.
type OSXFUSEPaths struct {
// Prefix for the device file. At mount time, an incrementing
// number is suffixed until a free FUSE device is found.
DevicePrefix string
// Path of the load helper, used to load the kernel extension if
// no device files are found.
Load string
// Path of the mount helper, used for the actual mount operation.
Mount string
// Environment variable used to pass the path to the executable
// calling the mount helper.
DaemonVar string
}
// Default paths for OSXFUSE. See OSXFUSELocations.
var (
OSXFUSELocationV3 = OSXFUSEPaths{
DevicePrefix: "/dev/osxfuse",
Load: "/Library/Filesystems/osxfuse.fs/Contents/Resources/load_osxfuse",
Mount: "/Library/Filesystems/osxfuse.fs/Contents/Resources/mount_osxfuse",
DaemonVar: "MOUNT_OSXFUSE_DAEMON_PATH",
}
OSXFUSELocationV2 = OSXFUSEPaths{
DevicePrefix: "/dev/osxfuse",
Load: "/Library/Filesystems/osxfusefs.fs/Support/load_osxfusefs",
Mount: "/Library/Filesystems/osxfusefs.fs/Support/mount_osxfusefs",
DaemonVar: "MOUNT_FUSEFS_DAEMON_PATH",
}
)
// OSXFUSELocations sets where to look for OSXFUSE files. The
// arguments are all the possible locations. The previous locations
// are replaced.
//
// Without this option, OSXFUSELocationV3 and OSXFUSELocationV2 are
// used.
//
// OS X only. Others ignore this option.
func OSXFUSELocations(paths ...OSXFUSEPaths) MountOption {
return func(conf *mountConfig) error {
if len(paths) == 0 {
return errors.New("must specify at least one location for OSXFUSELocations")
}
// replace previous values, but make a copy so there's no
// worries about caller mutating their slice
conf.osxfuseLocations = append(conf.osxfuseLocations[:0], paths...)
return nil
}
}
// AllowNonEmptyMount allows mounting over a non-empty directory.
//
// The files in it will be shadowed by the freshly created mount. By
// default these mounts are rejected to prevent accidental covering up
// of data, which could for example prevent automatic backup.
func AllowNonEmptyMount() MountOption {
return func(conf *mountConfig) error {
conf.options["nonempty"] = ""
return nil
}
}
// PosixACL enables POSIX ACL support.
func PosixACL() MountOption {
return func(conf *mountConfig) error {
conf.initFlags |= InitPOSIXACL
return nil
}
}
// RequestTimeout sets the request timeout.
func RequestTimeout(timeout int64) MountOption {
return func(conf *mountConfig) error {
conf.RequestTimeout = timeout
return nil
}
}
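// The options above are plain functions over *mountConfig, so they compose by
// simple application. The sketch below is illustrative only and not part of the
// upstream API; it assumes mountConfig's options field is a map[string]string,
// as the option bodies in this file suggest.
func exampleApplyMountOptions() error {
    conf := &mountConfig{options: make(map[string]string)}
    opts := []MountOption{
        AllowOther(),
        DefaultPermissions(),
        MaxReadahead(128 * 1024),
        AsyncRead(),
    }
    for _, opt := range opts {
        // each MountOption mutates the shared mountConfig and reports conflicts
        // (e.g. AllowOther vs AllowRoot) as errors
        if err := opt(conf); err != nil {
            return err
        }
    }
    // conf.options now contains "allow_other" and "default_permissions",
    // conf.maxReadahead is 131072, and InitAsyncRead is set in conf.initFlags.
    return nil
}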
package fuse
func localVolume(conf *mountConfig) error {
return nil
}
func volumeName(name string) MountOption {
return dummyOption
}
func daemonTimeout(name string) MountOption {
return dummyOption
}
func noAppleXattr(conf *mountConfig) error {
return nil
}
func noAppleDouble(conf *mountConfig) error {
return nil
}
func exclCreate(conf *mountConfig) error {
return nil
}
package fuse
import (
"fmt"
)
// Protocol is a FUSE protocol version number.
type Protocol struct {
Major uint32
Minor uint32
}
func (p Protocol) String() string {
return fmt.Sprintf("%d.%d", p.Major, p.Minor)
}
// LT returns whether a is less than b.
func (a Protocol) LT(b Protocol) bool {
return a.Major < b.Major ||
(a.Major == b.Major && a.Minor < b.Minor)
}
// GE returns whether a is greater than or equal to b.
func (a Protocol) GE(b Protocol) bool {
return a.Major > b.Major ||
(a.Major == b.Major && a.Minor >= b.Minor)
}
func (a Protocol) is79() bool {
return a.GE(Protocol{7, 9})
}
// HasAttrBlockSize returns whether Attr.BlockSize is respected by the
// kernel.
func (a Protocol) HasAttrBlockSize() bool {
return a.is79()
}
// HasReadWriteFlags returns whether ReadRequest/WriteRequest
// fields Flags and FileFlags are valid.
func (a Protocol) HasReadWriteFlags() bool {
return a.is79()
}
// HasGetattrFlags returns whether GetattrRequest field Flags is
// valid.
func (a Protocol) HasGetattrFlags() bool {
return a.is79()
}
func (a Protocol) is710() bool {
return a.GE(Protocol{7, 10})
}
// HasOpenNonSeekable returns whether OpenResponse field Flags flag
// OpenNonSeekable is supported.
func (a Protocol) HasOpenNonSeekable() bool {
return a.is710()
}
func (a Protocol) is712() bool {
return a.GE(Protocol{7, 12})
}
// HasUmask returns whether CreateRequest/MkdirRequest/MknodRequest
// field Umask is valid.
func (a Protocol) HasUmask() bool {
return a.is712()
}
// HasInvalidate returns whether InvalidateNode/InvalidateEntry are
// supported.
func (a Protocol) HasInvalidate() bool {
return a.is712()
}
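// A minimal sketch of how the helpers above are intended to be used: build the
// negotiated Protocol once, then branch on capability methods rather than on
// raw version numbers. Everything referenced here is defined in this file.
func exampleProtocolChecks() {
    negotiated := Protocol{Major: 7, Minor: 12}
    fmt.Println(negotiated)                     // "7.12"
    fmt.Println(negotiated.GE(Protocol{7, 10})) // true: OpenNonSeekable is usable
    fmt.Println(negotiated.LT(Protocol{7, 9}))  // false
    if negotiated.HasUmask() {
        // safe to read the Umask field on Create/Mkdir/Mknod requests
    }
    if !negotiated.HasInvalidate() {
        // InvalidateNode/InvalidateEntry are unavailable; skip cache invalidation
    }
}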
package fuse
// Unmount tries to unmount the filesystem mounted at dir.
func Unmount(dir string) error {
return unmount(dir)
}
package fuse
import (
"bytes"
"errors"
"os/exec"
)
func unmount(dir string) error {
cmd := exec.Command("fusermount", "-u", dir)
output, err := cmd.CombinedOutput()
if err != nil {
if len(output) > 0 {
output = bytes.TrimRight(output, "\n")
msg := err.Error() + ": " + string(output)
err = errors.New(msg)
}
return err
}
return nil
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
"strings"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/storage"
)
const (
_ = iota
// KB kilobytes
KB = 1 << (10 * iota)
// MB megabytes
MB
)
const (
defaultTickInterval = time.Millisecond * 2000
defaultHeartbeatTick = 1
defaultElectionTick = 5
defaultInflightMsgs = 128
defaultSizeReqBuffer = 2048
defaultSizeAppBuffer = 2048
defaultRetainLogs = 20000
defaultSizeSendBuffer = 10240
defaultReplConcurrency = 5
defaultSnapConcurrency = 10
defaultSizePerMsg = MB
defaultHeartbeatAddr = ":3016"
defaultReplicateAddr = ":2015"
)
// Config contains the parameters to start a raft server.
// Default: Do not use lease mechanism.
// NOTE: NodeID and Resolver are required. Every other parameter has a default value.
type Config struct {
TransportConfig
// NodeID is the identity of the local node. NodeID cannot be 0.
// This parameter is required.
NodeID uint64
// TickInterval is the interval of the timer that checks heartbeat and election timeouts.
// The default value is 2s.
TickInterval time.Duration
// HeartbeatTick is the heartbeat interval, in ticks. A leader sends heartbeat
// messages to maintain its leadership every HeartbeatTick ticks.
// The default value is 1 tick (2s with the default TickInterval).
HeartbeatTick int
// ElectionTick is the election timeout, in ticks. If a follower receives no message
// from the leader of the current term for ElectionTick ticks, it becomes a candidate and starts an election.
// ElectionTick must be greater than HeartbeatTick.
// We suggest ElectionTick = 10 * HeartbeatTick to avoid unnecessary leader switching.
// The default value is 5 ticks (10s with the default TickInterval).
ElectionTick int
// MaxSizePerMsg limits the max size of each append message.
// The default value is 1M.
MaxSizePerMsg uint64
// MaxInflightMsgs limits the max number of in-flight append messages during optimistic replication phase.
// The application transport layer usually has its own sending buffer over TCP/UDP.
// Set MaxInflightMsgs to avoid overflowing that sending buffer.
// The default value is 128.
MaxInflightMsgs int
// ReqBufferSize limits the capacity of the receive-request channel buffer.
// The default value is 2048.
ReqBufferSize int
// AppBufferSize limits the capacity of the apply channel buffer.
// The default value is 2048.
AppBufferSize int
// RetainLogs controls how many logs we leave after truncate.
// This is used so that we can quickly replay logs on a follower instead of being forced to send an entire snapshot.
// The default value is 20000.
RetainLogs uint64
// LeaseCheck determines whether to use the lease mechanism.
// The default value is false.
// (This is equivalent to etcd raft's CheckQuorum.)
LeaseCheck bool
// PreVote enables the Pre-Vote algorithm described in raft thesis section
// 9.6. This prevents disruption when a node that has been partitioned away
// rejoins the cluster.
PreVote bool
// ReadOnlyOption specifies how the read only request is processed.
//
// ReadOnlySafe guarantees the linearizability of the read only request by
// communicating with the quorum. It is the default and suggested option.
//
// ReadOnlyLeaseBased ensures linearizability of the read only request by
// relying on the leader lease. It can be affected by clock drift.
// If the clock drift is unbounded, leader might keep the lease longer than it
// should (clock can move backward/pause without any bound). ReadIndex is not safe
// in that case.
// LeaseCheck MUST be enabled if ReadOnlyOption is ReadOnlyLeaseBased.
ReadOnlyOption ReadOnlyOption
transport Transport
}
// TransportConfig raft server transport config
type TransportConfig struct {
// HeartbeatAddr is the heartbeat listen address.
// The default value is ":3016".
HeartbeatAddr string
// ReplicateAddr is the replication listen address.
// The default value is ":2015".
ReplicateAddr string
// SendBufferSize is the size of the send queue. The default value is 10240.
SendBufferSize int
// MaxReplConcurrency limits the number of concurrent replications (node->node). The default value is 5.
MaxReplConcurrency int
// MaxSnapConcurrency limits the max number of snapshot concurrency.
// The default value is 10.
MaxSnapConcurrency int
// This parameter is required.
Resolver SocketResolver
}
// RaftConfig contains the parameters to create a raft.
type RaftConfig struct {
ID uint64
Term uint64
Leader uint64
Applied uint64
Peers []proto.Peer
Storage storage.Storage
StateMachine StateMachine
Monitor Monitor
}
// DefaultConfig returns a Config with usable defaults.
func DefaultConfig() *Config {
conf := &Config{
TickInterval: defaultTickInterval,
HeartbeatTick: defaultHeartbeatTick,
ElectionTick: defaultElectionTick,
MaxSizePerMsg: defaultSizePerMsg,
MaxInflightMsgs: defaultInflightMsgs,
ReqBufferSize: defaultSizeReqBuffer,
AppBufferSize: defaultSizeAppBuffer,
RetainLogs: defaultRetainLogs,
LeaseCheck: false,
}
conf.HeartbeatAddr = defaultHeartbeatAddr
conf.ReplicateAddr = defaultReplicateAddr
conf.SendBufferSize = defaultSizeSendBuffer
conf.MaxReplConcurrency = defaultReplConcurrency
conf.MaxSnapConcurrency = defaultSnapConcurrency
return conf
}
// validate returns an error if any required elements of the Config are missing or invalid.
func (c *Config) validate() error {
if c.NodeID == 0 {
return errors.New("NodeID is required")
}
if c.TransportConfig.Resolver == nil {
return errors.New("Resolver is required")
}
if c.MaxSizePerMsg > 4*MB {
return errors.New("MaxSizePerMsg it too high")
}
if c.MaxInflightMsgs > 1024 {
return errors.New("MaxInflightMsgs is too high")
}
if c.MaxSnapConcurrency > 256 {
return errors.New("MaxSnapConcurrency is too high")
}
if c.MaxReplConcurrency > 256 {
return errors.New("MaxReplConcurrency is too high")
}
if c.ReadOnlyOption == ReadOnlyLeaseBased && !c.LeaseCheck {
return errors.New("LeaseCheck MUST be enabled when use ReadOnlyLeaseBased")
}
if strings.TrimSpace(c.TransportConfig.HeartbeatAddr) == "" {
c.TransportConfig.HeartbeatAddr = defaultHeartbeatAddr
}
if strings.TrimSpace(c.TransportConfig.ReplicateAddr) == "" {
c.TransportConfig.ReplicateAddr = defaultReplicateAddr
}
if c.TickInterval < 5*time.Millisecond {
c.TickInterval = defaultTickInterval
}
if c.HeartbeatTick <= 0 {
c.HeartbeatTick = defaultHeartbeatTick
}
if c.ElectionTick <= 0 {
c.ElectionTick = defaultElectionTick
}
if c.MaxSizePerMsg <= 0 {
c.MaxSizePerMsg = defaultSizePerMsg
}
if c.MaxInflightMsgs <= 0 {
c.MaxInflightMsgs = defaultInflightMsgs
}
if c.ReqBufferSize <= 0 {
c.ReqBufferSize = defaultSizeReqBuffer
}
if c.AppBufferSize <= 0 {
c.AppBufferSize = defaultSizeAppBuffer
}
if c.MaxSnapConcurrency <= 0 {
c.MaxSnapConcurrency = defaultSnapConcurrency
}
if c.MaxReplConcurrency <= 0 {
c.MaxReplConcurrency = defaultReplConcurrency
}
if c.SendBufferSize <= 0 {
c.SendBufferSize = defaultSizeSendBuffer
}
return nil
}
// validate returns an error if any required elements of the RaftConfig are missing or invalid.
func (c *RaftConfig) validate() error {
if c.ID == 0 {
return errors.New("ID is required")
}
if len(c.Peers) == 0 {
return errors.New("Peers is required")
}
if c.Storage == nil {
return errors.New("Storage is required")
}
if c.StateMachine == nil {
return errors.New("StateMachine is required")
}
return nil
}
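// A minimal sketch of the construction pattern implied by DefaultConfig and
// validate above: start from the defaults, set the two required fields
// (NodeID and Resolver), override what you need, and let validate backfill
// anything left at its zero value. SocketResolver is defined elsewhere in
// this package; the caller is assumed to provide an implementation.
func exampleConfig(resolver SocketResolver) (*Config, error) {
    c := DefaultConfig()
    c.NodeID = 1          // required
    c.Resolver = resolver // required
    c.HeartbeatTick = 1
    c.ElectionTick = 10 // keep ElectionTick = 10 * HeartbeatTick as suggested above
    c.LeaseCheck = true
    if err := c.validate(); err != nil {
        return nil, err
    }
    return c, nil
}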
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
)
var (
ErrCompacted = errors.New("requested index is unavailable due to compaction.")
ErrRaftExists = errors.New("raft already exists.")
ErrRaftNotExists = errors.New("raft not exists.")
ErrNotLeader = errors.New("raft is not the leader.")
ErrStopped = errors.New("raft is already shutdown.")
ErrSnapping = errors.New("raft is doing snapshot.")
ErrRetryLater = errors.New("retry later")
)
// FatalError carries an unrecoverable error together with the ID of the raft
// instance that hit it; it is delivered to the state machine via HandleFatalEvent.
type FatalError struct {
ID uint64
Err error
}
// AppPanicError is the panic error raised when the repl encounters a fatal error.
// The server will recover from this panic and stop the shard repl.
type AppPanicError string
func (pe *AppPanicError) Error() string {
return "Occurred application logic panic error: " + string(*pe)
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
type respErr struct {
errCh chan error
}
func (e *respErr) init() {
e.errCh = make(chan error, 1)
}
func (e *respErr) respond(err error) {
e.errCh <- err
close(e.errCh)
}
func (e *respErr) error() <-chan error {
return e.errCh
}
// Future the future
type Future struct {
respErr
respCh chan interface{}
}
func newFuture() *Future {
f := &Future{
respCh: make(chan interface{}, 1),
}
f.init()
return f
}
func (f *Future) respond(resp interface{}, err error) {
if err == nil {
f.respCh <- resp
close(f.respCh)
} else {
f.respErr.respond(err)
}
}
// Response wait response
func (f *Future) Response() (resp interface{}, err error) {
select {
case err = <-f.error():
return
case resp = <-f.respCh:
return
}
}
// AsyncResponse export channels
func (f *Future) AsyncResponse() (respCh <-chan interface{}, errCh <-chan error) {
return f.respCh, f.errCh
}
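// A minimal sketch of the Future contract implemented above: the producer side
// calls respond exactly once with either a result or an error, and the consumer
// either blocks on Response or selects on the channels from AsyncResponse.
func exampleFuture() {
    f := newFuture()
    go func() {
        // stand-in for the raft loop that eventually resolves the future
        f.respond("applied", nil)
    }()
    if resp, err := f.Response(); err == nil {
        _ = resp // "applied"
    }
}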
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package logger
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
)
// Logger encapsulation the log interface.
type Logger interface {
IsEnableDebug() bool
IsEnableInfo() bool
IsEnableWarn() bool
Debug(format string, v ...interface{})
Info(format string, v ...interface{})
Warn(format string, v ...interface{})
Error(format string, v ...interface{})
}
var (
stdLogger = NewDefaultLogger(0)
raftLogger = Logger(stdLogger)
)
// SetLogger replaces the package-level logger used by the raft library.
func SetLogger(l Logger) {
raftLogger = l
}
func IsEnableDebug() bool {
return raftLogger.IsEnableDebug()
}
func IsEnableInfo() bool {
return raftLogger.IsEnableInfo()
}
func IsEnableWarn() bool {
return raftLogger.IsEnableWarn()
}
func Debug(format string, v ...interface{}) {
raftLogger.Debug(format, v...)
}
func Info(format string, v ...interface{}) {
raftLogger.Info(format, v...)
}
func Warn(format string, v ...interface{}) {
raftLogger.Warn(format, v...)
}
func Error(format string, v ...interface{}) {
raftLogger.Error(format, v...)
}
// DefaultLogger is a default implementation of the Logger interface.
type DefaultLogger struct {
*log.Log
debugEnable bool
infoEnable bool
warnEnable bool
}
func NewDefaultLogger(level int) *DefaultLogger {
logger, err := log.NewLog("", "raft", "DEBUG")
if err != nil {
panic(err)
}
return &DefaultLogger{
Log: logger,
debugEnable: level <= log.DebugLevel,
infoEnable: level <= log.InfoLevel,
warnEnable: level <= log.WarnLevel,
}
}
func (l *DefaultLogger) header(lvl, msg string) string {
return fmt.Sprintf("%s: %s", lvl, msg)
}
func (l *DefaultLogger) IsEnableDebug() bool {
return l.debugEnable
}
func (l *DefaultLogger) Debug(format string, v ...interface{}) {
l.Output(4, l.header("DEBUG", fmt.Sprintf(format, v...)), false)
}
func (l *DefaultLogger) IsEnableInfo() bool {
return l.infoEnable
}
func (l *DefaultLogger) Info(format string, v ...interface{}) {
l.Output(4, l.header("INFO", fmt.Sprintf(format, v...)), false)
}
func (l *DefaultLogger) IsEnableWarn() bool {
return l.warnEnable
}
func (l *DefaultLogger) Warn(format string, v ...interface{}) {
l.Output(4, l.header("WARN", fmt.Sprintf(format, v...)), false)
}
func (l *DefaultLogger) Error(format string, v ...interface{}) {
l.Output(4, l.header("ERROR", fmt.Sprintf(format, v...)), false)
}
type FileLogger struct {
*log.Log
debugEnable bool
infoEnable bool
warnEnable bool
}
func NewFileLogger(logger *log.Log, level int) *FileLogger {
return &FileLogger{
Log: logger,
debugEnable: level <= log.DebugLevel,
infoEnable: level <= log.InfoLevel,
warnEnable: level <= log.WarnLevel,
}
}
func (fl *FileLogger) IsEnableDebug() bool {
return fl.debugEnable
}
func (fl *FileLogger) Debug(format string, v ...interface{}) {
// write through the embedded log.Log (same Output pattern as DefaultLogger)
// instead of recursing into this method
fl.Output(4, fmt.Sprintf(format, v...), false)
}
func (fl *FileLogger) IsEnableInfo() bool {
return fl.infoEnable
}
func (fl *FileLogger) Info(format string, v ...interface{}) {
fl.Output(4, fmt.Sprintf(format, v...), false)
}
func (fl *FileLogger) IsEnableWarn() bool {
return fl.warnEnable
}
func (fl *FileLogger) Warn(format string, v ...interface{}) {
fl.Output(4, fmt.Sprintf(format, v...), false)
}
func (fl *FileLogger) Error(format string, v ...interface{}) {
fl.Output(4, fmt.Sprintf(format, v...), false)
}
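// A minimal sketch of how an application installs its own log sink: wrap an
// existing *log.Log (or any Logger implementation) and register it with
// SetLogger, after which the package-level helpers route to it. The level
// constants come from the util/log package imported above.
func exampleSetLogger(l *log.Log) {
    SetLogger(NewFileLogger(l, log.InfoLevel))
    if IsEnableDebug() {
        Debug("debug logging enabled for raft internals")
    }
    Info("raft logger switched to the application-provided sink")
}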
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"sync"
)
var pool = newPoolFactory()
type poolFactory struct {
applyPool *sync.Pool
proposalPool *sync.Pool
}
func newPoolFactory() *poolFactory {
return &poolFactory{
applyPool: &sync.Pool{
New: func() interface{} {
return new(apply)
},
},
proposalPool: &sync.Pool{
New: func() interface{} {
return new(proposal)
},
},
}
}
func (f *poolFactory) getApply() *apply {
a := f.applyPool.Get().(*apply)
a.command = nil
a.future = nil
a.readIndexes = nil
return a
}
func (f *poolFactory) returnApply(a *apply) {
if a != nil {
f.applyPool.Put(a)
}
}
func (f *poolFactory) getProposal() *proposal {
p := f.proposalPool.Get().(*proposal)
p.data = nil
p.future = nil
return p
}
func (f *poolFactory) returnProposal(p *proposal) {
if p != nil {
f.proposalPool.Put(p)
}
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package proto
import (
"encoding/binary"
"fmt"
"io"
"sort"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
const (
version1 byte = 1
peer_size uint64 = 11
entry_header uint64 = 17
snapmeta_header uint64 = 20
message_header uint64 = 68
)
// Peer codec
func (p *Peer) Encode(datas []byte) {
datas[0] = byte(p.Type)
binary.BigEndian.PutUint16(datas[1:], p.Priority)
binary.BigEndian.PutUint64(datas[3:], p.ID)
}
func (p *Peer) Decode(datas []byte) {
p.Type = PeerType(datas[0])
p.Priority = binary.BigEndian.Uint16(datas[1:])
p.ID = binary.BigEndian.Uint64(datas[3:])
}
// HardState codec
func (c *HardState) Encode(datas []byte) {
binary.BigEndian.PutUint64(datas[0:], c.Term)
binary.BigEndian.PutUint64(datas[8:], c.Commit)
binary.BigEndian.PutUint64(datas[16:], c.Vote)
}
func (c *HardState) Decode(datas []byte) {
c.Term = binary.BigEndian.Uint64(datas[0:])
c.Commit = binary.BigEndian.Uint64(datas[8:])
c.Vote = binary.BigEndian.Uint64(datas[16:])
}
func (c *HardState) Size() uint64 {
return 24
}
// ConfChange codec
func (c *ConfChange) Encode() []byte {
datas := make([]byte, 1+peer_size+uint64(len(c.Context)))
datas[0] = byte(c.Type)
c.Peer.Encode(datas[1:])
if len(c.Context) > 0 {
copy(datas[peer_size+1:], c.Context)
}
return datas
}
func (c *ConfChange) Decode(datas []byte) {
c.Type = ConfChangeType(datas[0])
c.Peer.Decode(datas[1:])
if uint64(len(datas)) > peer_size+1 {
c.Context = append([]byte{}, datas[peer_size+1:]...)
}
}
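// A minimal round-trip sketch of the ConfChange/Peer wire format defined above:
// one byte of change type, an 11-byte peer record, then the optional opaque
// context. ConfAddNode, PeerNormal and the String methods used by Println are
// defined elsewhere in this package.
func exampleConfChangeRoundTrip() {
    cc := &ConfChange{
        Type:    ConfAddNode,
        Peer:    Peer{Type: PeerNormal, Priority: 1, ID: 42},
        Context: []byte("rebalance"),
    }
    var got ConfChange
    got.Decode(cc.Encode())
    fmt.Println(got.Type, got.Peer.ID, string(got.Context)) // ConfAddNode 42 rebalance
}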
// SnapshotMeta codec
func (m *SnapshotMeta) Size() uint64 {
return snapmeta_header + peer_size*uint64(len(m.Peers))
}
func (m *SnapshotMeta) Encode(w io.Writer) error {
buf := getByteSlice()
defer returnByteSlice(buf)
binary.BigEndian.PutUint64(buf, m.Index)
binary.BigEndian.PutUint64(buf[8:], m.Term)
binary.BigEndian.PutUint32(buf[16:], uint32(len(m.Peers)))
if _, err := w.Write(buf[0:snapmeta_header]); err != nil {
return err
}
for _, p := range m.Peers {
p.Encode(buf)
if _, err := w.Write(buf[0:peer_size]); err != nil {
return err
}
}
return nil
}
func (m *SnapshotMeta) Decode(datas []byte) {
m.Index = binary.BigEndian.Uint64(datas)
m.Term = binary.BigEndian.Uint64(datas[8:])
size := binary.BigEndian.Uint32(datas[16:])
m.Peers = make([]Peer, size)
start := snapmeta_header
for i := uint32(0); i < size; i++ {
m.Peers[i].Decode(datas[start:])
start = start + peer_size
}
}
// Entry codec
func (e *Entry) Size() uint64 {
return entry_header + uint64(len(e.Data))
}
func (e *Entry) Encode(w io.Writer) error {
buf := getByteSlice()
defer returnByteSlice(buf)
buf[0] = byte(e.Type)
binary.BigEndian.PutUint64(buf[1:], e.Term)
binary.BigEndian.PutUint64(buf[9:], e.Index)
if _, err := w.Write(buf[0:entry_header]); err != nil {
return err
}
if len(e.Data) > 0 {
if _, err := w.Write(e.Data); err != nil {
return err
}
}
return nil
}
func (e *Entry) Decode(datas []byte) {
e.Type = EntryType(datas[0])
e.Term = binary.BigEndian.Uint64(datas[1:])
e.Index = binary.BigEndian.Uint64(datas[9:])
if uint64(len(datas)) > entry_header {
e.Data = datas[entry_header:]
}
}
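// A minimal round-trip sketch of the entry wire format: Encode writes the
// 17-byte header (type, term, index) followed by the raw data, and Decode
// parses the same layout back out of a byte slice. bytes.Buffer stands in for
// the network or storage writer used by real callers and would need an extra
// "bytes" import in this file.
func exampleEntryRoundTrip() error {
    in := &Entry{Type: EntryNormal, Term: 7, Index: 101, Data: []byte("set x=1")}
    var buf bytes.Buffer
    if err := in.Encode(&buf); err != nil {
        return err
    }
    var out Entry
    out.Decode(buf.Bytes())
    fmt.Println(out.Term, out.Index, string(out.Data)) // 7 101 set x=1
    return nil
}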
// Message codec
func (m *Message) Size() uint64 {
if m.Type == ReqMsgSnapShot {
return message_header + m.SnapshotMeta.Size()
}
size := message_header + 4
if len(m.Entries) > 0 {
for _, e := range m.Entries {
size = size + e.Size() + 4
}
}
if len(m.Context) > 0 {
size = size + uint64(len(m.Context))
}
return size
}
func (m *Message) Encode(w io.Writer) error {
buf := getByteSlice()
defer returnByteSlice(buf)
binary.BigEndian.PutUint32(buf, uint32(m.Size()))
buf[4] = version1
buf[5] = byte(m.Type)
if m.ForceVote {
buf[6] = 1
} else {
buf[6] = 0
}
if m.Reject {
buf[7] = 1
} else {
buf[7] = 0
}
binary.BigEndian.PutUint64(buf[8:], m.RejectHint)
binary.BigEndian.PutUint64(buf[16:], m.ID)
binary.BigEndian.PutUint64(buf[24:], m.From)
binary.BigEndian.PutUint64(buf[32:], m.To)
binary.BigEndian.PutUint64(buf[40:], m.Term)
binary.BigEndian.PutUint64(buf[48:], m.LogTerm)
binary.BigEndian.PutUint64(buf[56:], m.Index)
binary.BigEndian.PutUint64(buf[64:], m.Commit)
if _, err := w.Write(buf[0 : message_header+4]); err != nil {
return err
}
if m.Type == ReqMsgSnapShot {
return m.SnapshotMeta.Encode(w)
}
binary.BigEndian.PutUint32(buf, uint32(len(m.Entries)))
if _, err := w.Write(buf[0:4]); err != nil {
return err
}
if len(m.Entries) > 0 {
for _, e := range m.Entries {
binary.BigEndian.PutUint32(buf, uint32(e.Size()))
if _, err := w.Write(buf[0:4]); err != nil {
return err
}
if err := e.Encode(w); err != nil {
return err
}
}
}
if len(m.Context) > 0 {
if _, err := w.Write(m.Context); err != nil {
return err
}
}
return nil
}
func (m *Message) Decode(r *util.BufferReader) error {
var (
datas []byte
err error
)
if datas, err = r.ReadFull(4); err != nil {
return err
}
cnt := int(binary.BigEndian.Uint32(datas))
if cnt > 256*1024*1024 {
return fmt.Errorf("msg len is too big, please check, %d", cnt)
}
if datas, err = r.ReadFull(cnt); err != nil {
return err
}
if len(datas) == 0 {
return nil
}
ver := datas[0]
if ver == version1 {
m.Type = MsgType(datas[1])
m.ForceVote = (datas[2] == 1)
m.Reject = (datas[3] == 1)
m.RejectHint = binary.BigEndian.Uint64(datas[4:])
m.ID = binary.BigEndian.Uint64(datas[12:])
m.From = binary.BigEndian.Uint64(datas[20:])
m.To = binary.BigEndian.Uint64(datas[28:])
m.Term = binary.BigEndian.Uint64(datas[36:])
m.LogTerm = binary.BigEndian.Uint64(datas[44:])
m.Index = binary.BigEndian.Uint64(datas[52:])
m.Commit = binary.BigEndian.Uint64(datas[60:])
if m.Type == ReqMsgSnapShot {
m.SnapshotMeta.Decode(datas[message_header:])
} else {
size := binary.BigEndian.Uint32(datas[message_header:])
start := message_header + 4
if size > 0 {
for i := uint32(0); i < size; i++ {
esize := binary.BigEndian.Uint32(datas[start:])
start = start + 4
end := start + uint64(esize)
entry := new(Entry)
entry.Decode(datas[start:end])
m.Entries = append(m.Entries, entry)
start = end
}
}
if start < uint64(len(datas)) {
m.Context = datas[start:]
}
}
}
return nil
}
func EncodeHBConext(ctx HeartbeatContext) (buf []byte) {
sort.Slice(ctx, func(i, j int) bool {
return ctx[i] < ctx[j]
})
scratch := make([]byte, binary.MaxVarintLen64)
prev := uint64(0)
for _, id := range ctx {
n := binary.PutUvarint(scratch, id-prev)
buf = append(buf, scratch[:n]...)
prev = id
}
return
}
func DecodeHBContext(buf []byte) (ctx HeartbeatContext) {
prev := uint64(0)
for len(buf) > 0 {
id, n := binary.Uvarint(buf)
ctx = append(ctx, id+prev)
prev = id + prev
buf = buf[n:]
}
return
}
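// A minimal sketch of the heartbeat-context encoding above: IDs are sorted and
// stored as varint deltas from their predecessor, so dense ID ranges encode
// into very few bytes. Note that EncodeHBConext sorts the slice in place.
func exampleHeartbeatContext() {
    ids := HeartbeatContext{30, 10, 20, 21}
    buf := EncodeHBConext(ids)
    fmt.Println(DecodeHBContext(buf)) // [10 20 21 30]
}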
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package proto
import (
"sync"
)
var (
msgPool = &sync.Pool{
New: func() interface{} {
return &Message{
Entries: make([]*Entry, 0, 128),
}
},
}
bytePool = &sync.Pool{
New: func() interface{} {
return make([]byte, 128)
},
}
)
func GetMessage() *Message {
msg := msgPool.Get().(*Message)
msg.Reject = false
msg.RejectHint = 0
msg.ID = 0
msg.From = 0
msg.To = 0
msg.Term = 0
msg.LogTerm = 0
msg.Index = 0
msg.Commit = 0
msg.SnapshotMeta.Index = 0
msg.SnapshotMeta.Term = 0
msg.SnapshotMeta.Peers = nil
msg.Snapshot = nil
msg.Context = nil
msg.Entries = msg.Entries[0:0]
return msg
}
func ReturnMessage(msg *Message) {
if msg != nil {
msgPool.Put(msg)
}
}
func getByteSlice() []byte {
return bytePool.Get().([]byte)
}
func returnByteSlice(b []byte) {
bytePool.Put(b)
}
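// A minimal sketch of the borrow/return discipline for the pools above:
// GetMessage hands out a reset *Message (entry slice emptied but capacity
// kept), the caller fills it in, and ReturnMessage puts it back once the
// message has been fully handled.
func exampleMessagePool() {
    msg := GetMessage()
    msg.Type = ReqMsgHeartBeat
    msg.From, msg.To, msg.Term = 1, 2, 5
    // ... hand msg to the transport or state machine ...
    ReturnMessage(msg)
}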
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package proto
import (
"fmt"
)
type (
MsgType byte
EntryType byte
ConfChangeType byte
PeerType byte
)
const (
ReqMsgAppend MsgType = iota
ReqMsgPreVote
ReqMsgHeartBeat
ReqMsgSnapShot
ReqMsgVote
RespMsgAppend
RespMsgPreVote
RespMsgHeartBeat
RespMsgSnapShot
RespMsgVote
LocalMsgHup
LocalMsgProp
LeaseMsgOffline
LeaseMsgTimeout
ReqCheckQuorum
RespCheckQuorum
)
const (
ConfAddNode ConfChangeType = 0
ConfRemoveNode ConfChangeType = 1
ConfUpdateNode ConfChangeType = 2
EntryNormal EntryType = 0
EntryConfChange EntryType = 1
PeerNormal PeerType = 0
PeerArbiter PeerType = 1
)
// The Snapshot interface is supplied by the application to access its snapshot data.
type Snapshot interface {
SnapIterator
ApplyIndex() uint64
Close()
}
type SnapIterator interface {
// Next returns io.EOF when the snapshot data has been fully consumed.
Next() ([]byte, error)
}
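// memSnapshot is a minimal sketch of the Snapshot interface the application
// must supply: it iterates over an in-memory batch of encoded state and
// reports the raft index the data corresponds to. Real implementations
// usually stream from storage instead; this sketch also assumes an extra
// "io" import in this file for io.EOF.
type memSnapshot struct {
    data  [][]byte
    pos   int
    index uint64
}

func (s *memSnapshot) Next() ([]byte, error) {
    if s.pos >= len(s.data) {
        return nil, io.EOF // io.EOF signals the end of the snapshot stream
    }
    b := s.data[s.pos]
    s.pos++
    return b, nil
}

func (s *memSnapshot) ApplyIndex() uint64 { return s.index }

func (s *memSnapshot) Close() {}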
type SnapshotMeta struct {
Index uint64
Term uint64
Peers []Peer
}
type Peer struct {
Type PeerType
Priority uint16
ID uint64 // NodeID
PeerID uint64 // Replica ID, unique over all raft groups and all replicas in the same group
}
// HardState is the repl state that must be persisted to storage.
type HardState struct {
Term uint64
Commit uint64
Vote uint64
}
// Entry is the repl log entry.
type Entry struct {
Type EntryType
Term uint64
Index uint64
Data []byte
}
// Message is the transport message.
type Message struct {
Type MsgType
ForceVote bool
Reject bool
RejectHint uint64
ID uint64
From uint64
To uint64
Term uint64
LogTerm uint64
Index uint64
Commit uint64
SnapshotMeta SnapshotMeta
Entries []*Entry
Context []byte
Snapshot Snapshot // No need for codec
}
func (m *Message) ToString() (mesg string) {
return fmt.Sprintf("Mesg:[%v] type(%v) ForceVote(%v) Reject(%v) RejectHint(%v) "+
"From(%v) To(%v) Term(%v) LogTrem(%v) Index(%v) Commit(%v)", m.ID, m.Type.String(), m.ForceVote,
m.Reject, m.RejectHint, m.From, m.To, m.Term, m.LogTerm, m.Index, m.Commit)
}
type ConfChange struct {
Type ConfChangeType
Peer Peer
Context []byte
}
type HeartbeatContext []uint64
func (t MsgType) String() string {
switch t {
case 0:
return "ReqMsgAppend"
case 1:
return "ReqMsgPreVote"
case 2:
return "ReqMsgHeartBeat"
case 3:
return "ReqMsgSnapShot"
case 4:
return "ReqMsgVote"
case 5:
return "RespMsgAppend"
case 6:
return "RespMsgPreVote"
case 7:
return "RespMsgHeartBeat"
case 8:
return "RespMsgSnapShot"
case 9:
return "RespMsgVote"
case 10:
return "LocalMsgHup"
case 11:
return "LocalMsgProp"
case 12:
return "LeaseMsgOffline"
case 13:
return "LeaseMsgTimeout"
case 14:
return "ReqCheckQuorum"
case 15:
return "RespCheckQuorum"
}
return "unknown"
}
func (t EntryType) String() string {
switch t {
case 0:
return "EntryNormal"
case 1:
return "EntryConfChange"
}
return "unknown"
}
func (t ConfChangeType) String() string {
switch t {
case 0:
return "ConfAddNode"
case 1:
return "ConfRemoveNode"
case 2:
return "ConfUpdateNode"
}
return "unknown"
}
func (t PeerType) String() string {
switch t {
case 0:
return "PeerNormal"
case 1:
return "PeerArbiter"
}
return "unknown"
}
func (p Peer) String() string {
return fmt.Sprintf(`"nodeID":"%v","peerID":"%v","priority":"%v","type":"%v"`,
p.ID, p.PeerID, p.Priority, p.Type.String())
}
func (cc *ConfChange) String() string {
return fmt.Sprintf(`{"type":"%v",%v}`, cc.Type, cc.Peer.String())
}
func (m *Message) IsResponseMsg() bool {
return m.Type == RespMsgAppend || m.Type == RespMsgHeartBeat || m.Type == RespMsgVote ||
m.Type == RespMsgPreVote || m.Type == RespMsgSnapShot || m.Type == RespCheckQuorum
}
func (m *Message) IsElectionMsg() bool {
return m.Type == ReqMsgHeartBeat || m.Type == RespMsgHeartBeat || m.Type == ReqMsgVote || m.Type == RespMsgVote ||
m.Type == ReqMsgPreVote || m.Type == RespMsgPreVote || m.Type == LeaseMsgOffline || m.Type == LeaseMsgTimeout
}
func (m *Message) IsHeartbeatMsg() bool {
return m.Type == ReqMsgHeartBeat || m.Type == RespMsgHeartBeat
}
func (s *HardState) IsEmpty() bool {
return s.Term == 0 && s.Vote == 0 && s.Commit == 0
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"runtime"
"sync"
"sync/atomic"
"unsafe"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type proposal struct {
cmdType proto.EntryType
future *Future
data []byte
}
type apply struct {
term uint64
index uint64
future *Future
command interface{}
readIndexes []*Future
}
// entryRequest carries a user request to fetch log entries.
type entryRequest struct {
future *Future
index uint64
maxSize uint64
onlyCommit bool
}
type softState struct {
leader uint64
term uint64
}
type peerState struct {
peers map[uint64]proto.Peer
mu sync.RWMutex
}
type monitorStatus struct {
conErrCount uint8
replicasErrCnt map[uint64]uint8
}
func (s *peerState) change(c *proto.ConfChange) {
s.mu.Lock()
switch c.Type {
case proto.ConfAddNode:
s.peers[c.Peer.ID] = c.Peer
case proto.ConfRemoveNode:
delete(s.peers, c.Peer.ID)
case proto.ConfUpdateNode:
s.peers[c.Peer.ID] = c.Peer
}
s.mu.Unlock()
}
func (s *peerState) replace(peers []proto.Peer) {
s.mu.Lock()
s.peers = nil
s.peers = make(map[uint64]proto.Peer)
for _, p := range peers {
s.peers[p.ID] = p
}
s.mu.Unlock()
}
func (s *peerState) get() (nodes []uint64) {
s.mu.RLock()
for n := range s.peers {
nodes = append(nodes, n)
}
s.mu.RUnlock()
return
}
type raft struct {
raftFsm *raftFsm
config *Config
raftConfig *RaftConfig
restoringSnapshot util.AtomicBool
curApplied util.AtomicUInt64
curSoftSt unsafe.Pointer
prevSoftSt softState
prevHardSt proto.HardState
peerState peerState
pending map[uint64]*Future
snapping map[uint64]*snapshotStatus
mStatus *monitorStatus
propc chan *proposal
applyc chan *apply
recvc chan *proto.Message
snapRecvc chan *snapshotRequest
truncatec chan uint64
readIndexC chan *Future
statusc chan chan *Status
entryRequestC chan *entryRequest
readyc chan struct{}
tickc chan struct{}
electc chan struct{}
stopc chan struct{}
done chan struct{}
mu sync.Mutex
}
func newRaft(config *Config, raftConfig *RaftConfig) (*raft, error) {
defer util.HandleCrash()
if err := raftConfig.validate(); err != nil {
return nil, err
}
r, err := newRaftFsm(config, raftConfig)
if err != nil {
return nil, err
}
mStatus := &monitorStatus{
conErrCount: 0,
replicasErrCnt: make(map[uint64]uint8),
}
raft := &raft{
raftFsm: r,
config: config,
raftConfig: raftConfig,
mStatus: mStatus,
pending: make(map[uint64]*Future),
snapping: make(map[uint64]*snapshotStatus),
recvc: make(chan *proto.Message, config.ReqBufferSize),
applyc: make(chan *apply, config.AppBufferSize),
propc: make(chan *proposal, 256),
snapRecvc: make(chan *snapshotRequest, 1),
truncatec: make(chan uint64, 1),
readIndexC: make(chan *Future, 256),
statusc: make(chan chan *Status, 1),
entryRequestC: make(chan *entryRequest, 16),
tickc: make(chan struct{}, 64),
readyc: make(chan struct{}, 1),
electc: make(chan struct{}, 1),
stopc: make(chan struct{}),
done: make(chan struct{}),
}
raft.curApplied.Set(r.raftLog.applied)
raft.peerState.replace(raftConfig.Peers)
util.RunWorker(raft.runApply, raft.handlePanic)
util.RunWorker(raft.run, raft.handlePanic)
return raft, nil
}
func (s *raft) stop() {
select {
case <-s.done:
return
default:
s.doStop()
}
<-s.done
}
func (s *raft) doStop() {
s.mu.Lock()
defer s.mu.Unlock()
select {
case <-s.stopc:
return
default:
s.raftFsm.StopFsm()
close(s.stopc)
s.restoringSnapshot.Set(false)
}
}
func (s *raft) runApply() {
defer func() {
s.doStop()
s.resetApply()
}()
loopCount := 0
for {
loopCount = loopCount + 1
if loopCount > 16 {
loopCount = 0
runtime.Gosched()
}
select {
case <-s.stopc:
return
case apply := <-s.applyc:
if apply.index <= s.curApplied.Get() {
if len(apply.readIndexes) > 0 {
respondReadIndex(apply.readIndexes, nil)
}
continue
}
var (
err error
resp interface{}
)
switch cmd := apply.command.(type) {
case *proto.ConfChange:
resp, err = s.raftConfig.StateMachine.ApplyMemberChange(cmd, apply.index)
if cmd.Type == proto.ConfRemoveNode && err == nil {
s.raftFsm.mo.RemovePeer(s.raftFsm.id, cmd.Peer)
}
case []byte:
resp, err = s.raftConfig.StateMachine.Apply(cmd, apply.index)
}
if apply.future != nil {
apply.future.respond(resp, err)
}
if len(apply.readIndexes) > 0 {
respondReadIndex(apply.readIndexes, nil)
}
s.curApplied.Set(apply.index)
pool.returnApply(apply)
}
}
}
func (s *raft) run() {
defer func() {
s.doStop()
s.resetPending(ErrStopped)
s.raftFsm.readOnly.reset(ErrStopped)
s.stopSnapping()
s.raftConfig.Storage.Close()
close(s.done)
}()
s.prevHardSt.Term = s.raftFsm.term
s.prevHardSt.Vote = s.raftFsm.vote
s.prevHardSt.Commit = s.raftFsm.raftLog.committed
s.maybeChange(true)
loopCount := 0
var readyc chan struct{}
for {
if readyc == nil && s.containsUpdate() {
readyc = s.readyc
readyc <- struct{}{}
}
select {
case <-s.stopc:
return
case <-s.tickc:
s.raftFsm.tick()
s.maybeChange(true)
case pr := <-s.propc:
if s.raftFsm.leader != s.config.NodeID {
pr.future.respond(nil, ErrNotLeader)
pool.returnProposal(pr)
break
}
msg := proto.GetMessage()
msg.Type = proto.LocalMsgProp
msg.From = s.config.NodeID
starti := s.raftFsm.raftLog.lastIndex() + 1
s.pending[starti] = pr.future
msg.Entries = append(msg.Entries, &proto.Entry{Term: s.raftFsm.term, Index: starti, Type: pr.cmdType, Data: pr.data})
pool.returnProposal(pr)
flag := false
for i := 1; i < 64; i++ {
starti = starti + 1
select {
case pr := <-s.propc:
s.pending[starti] = pr.future
msg.Entries = append(msg.Entries, &proto.Entry{Term: s.raftFsm.term, Index: starti, Type: pr.cmdType, Data: pr.data})
pool.returnProposal(pr)
default:
flag = true
}
if flag {
break
}
}
s.raftFsm.Step(msg)
case m := <-s.recvc:
if _, ok := s.raftFsm.replicas[m.From]; ok || (!m.IsResponseMsg() && m.Type != proto.ReqMsgVote) ||
(m.Type == proto.ReqMsgVote && s.raftFsm.raftLog.isUpToDate(m.Index, m.LogTerm, 0, 0)) {
switch m.Type {
case proto.ReqMsgHeartBeat:
// If s.raftFsm.leader == NoLeader, heartbeat requests still need to be handled;
// otherwise a PreCandidate would never step back to Follower.
// That is why the condition s.raftFsm.leader == m.From was removed.
if m.From != s.config.NodeID {
s.raftFsm.Step(m)
}
case proto.RespMsgHeartBeat:
if s.raftFsm.leader == s.config.NodeID && m.From != s.config.NodeID {
s.raftFsm.Step(m)
}
default:
s.raftFsm.Step(m)
}
var respErr = true
if m.Type == proto.RespMsgAppend && !m.Reject {
respErr = false
}
s.maybeChange(respErr)
} else if logger.IsEnableWarn() && m.Type != proto.RespMsgHeartBeat {
logger.Warn(" [raft] [%v term: %d] raftFm[%p] raftReplicas[%v] ignored a %s message "+
"without the replica from [%v term: %d].",
s.raftFsm.id, s.raftFsm.term, s.raftFsm, s.raftFsm.getReplicas(), m.Type, m.From, m.Term)
}
case snapReq := <-s.snapRecvc:
s.handleSnapshot(snapReq)
case <-readyc:
s.persist()
s.apply()
s.advance()
// Send all messages.
for _, msg := range s.raftFsm.msgs {
if msg.Type == proto.ReqMsgSnapShot {
s.sendSnapshot(msg)
continue
}
s.sendMessage(msg)
}
s.raftFsm.msgs = nil
readyc = nil
loopCount = loopCount + 1
if loopCount >= 2 {
loopCount = 0
runtime.Gosched()
}
case <-s.electc:
msg := proto.GetMessage()
msg.Type = proto.LocalMsgHup
msg.From = s.config.NodeID
msg.ForceVote = true
logger.Debug("raft[%v] node %v try to leader", s.raftFsm.id, s.config.NodeID)
s.raftFsm.Step(msg)
s.maybeChange(true)
case c := <-s.statusc:
c <- s.getStatus()
case truncIndex := <-s.truncatec:
func() {
defer util.HandleCrash()
if lasti, err := s.raftConfig.Storage.LastIndex(); err != nil {
logger.Error("raft[%v] truncate failed to get last index from storage: %v", s.raftFsm.id, err)
} else if lasti > s.config.RetainLogs {
maxIndex := util.Min(truncIndex, lasti-s.config.RetainLogs)
if err = s.raftConfig.Storage.Truncate(maxIndex); err != nil {
logger.Error("raft[%v] truncate failed,error is: %v", s.raftFsm.id, err)
}
}
}()
case future := <-s.readIndexC:
futures := []*Future{future}
// handle in batch
var flag bool
for i := 1; i < 64; i++ {
select {
case f := <-s.readIndexC:
futures = append(futures, f)
default:
flag = true
}
if flag {
break
}
}
s.raftFsm.addReadIndex(futures)
case req := <-s.entryRequestC:
s.getEntriesInLoop(req)
}
}
}
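// drainProposals is a standalone sketch of the batching pattern run() uses
// twice above (for proposals and for read-index futures): after one blocking
// receive, opportunistically pull more items with a non-blocking select so a
// burst is handled as a single batch. It is illustrative only and not called
// by run() itself.
func drainProposals(first *proposal, propc <-chan *proposal, limit int) []*proposal {
    batch := []*proposal{first}
    for len(batch) < limit {
        select {
        case pr := <-propc:
            batch = append(batch, pr)
        default:
            return batch // channel drained; stop without blocking
        }
    }
    return batch
}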
func (s *raft) tick() {
if s.restoringSnapshot.Get() {
return
}
select {
case <-s.stopc:
case s.tickc <- struct{}{}:
default:
return
}
}
func (s *raft) propose(cmd []byte, future *Future) {
if !s.isLeader() {
future.respond(nil, ErrNotLeader)
return
}
pr := pool.getProposal()
pr.cmdType = proto.EntryNormal
pr.data = cmd
pr.future = future
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.propc <- pr:
}
}
func (s *raft) proposeMemberChange(cc *proto.ConfChange, future *Future) {
if !s.isLeader() {
future.respond(nil, ErrNotLeader)
return
}
pr := pool.getProposal()
pr.cmdType = proto.EntryConfChange
pr.future = future
pr.data = cc.Encode()
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.propc <- pr:
}
}
func (s *raft) reciveMessage(m *proto.Message) {
if s.restoringSnapshot.Get() {
return
}
select {
case <-s.stopc:
case s.recvc <- m:
default:
logger.Warn(fmt.Sprintf("raft[%v] discard message(%v)", s.raftConfig.ID, m.ToString()))
return
}
}
func (s *raft) reciveSnapshot(m *snapshotRequest) {
if s.restoringSnapshot.Get() {
m.respond(ErrSnapping)
return
}
select {
case <-s.stopc:
m.respond(ErrStopped)
return
case s.snapRecvc <- m:
}
}
func (s *raft) status() *Status {
if s.restoringSnapshot.Get() {
return &Status{
ID: s.raftFsm.id,
NodeID: s.config.NodeID,
RestoringSnapshot: true,
State: stateFollower.String(),
}
}
c := make(chan *Status, 1)
select {
case <-s.stopc:
return nil
case s.statusc <- c:
return <-c
}
}
func (s *raft) truncate(index uint64) {
logger.Debug("raft[%v] truncate index %v", s.raftFsm.id, index)
if s.restoringSnapshot.Get() {
return
}
select {
case <-s.stopc:
case s.truncatec <- index:
default:
return
}
}
func (s *raft) tryToLeader(future *Future) {
if s.restoringSnapshot.Get() {
future.respond(nil, nil)
return
}
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.electc <- struct{}{}:
future.respond(nil, nil)
}
}
func (s *raft) leaderTerm() (leader, term uint64) {
st := (*softState)(atomic.LoadPointer(&s.curSoftSt))
if st == nil {
return NoLeader, 0
}
return st.leader, st.term
}
func (s *raft) isLeader() bool {
leader, _ := s.leaderTerm()
return leader == s.config.NodeID
}
func (s *raft) applied() uint64 {
return s.curApplied.Get()
}
func (s *raft) committed() uint64 {
return s.raftFsm.raftLog.committed
}
func (s *raft) sendMessage(m *proto.Message) {
s.config.transport.Send(m)
}
func (s *raft) maybeChange(respErr bool) {
updated := false
if s.prevSoftSt.term != s.raftFsm.term {
updated = true
s.prevSoftSt.term = s.raftFsm.term
s.resetTick()
}
preLeader := s.prevSoftSt.leader
if preLeader != s.raftFsm.leader {
updated = true
s.prevSoftSt.leader = s.raftFsm.leader
if s.raftFsm.leader != s.config.NodeID {
if respErr || preLeader != s.config.NodeID {
s.resetPending(ErrNotLeader)
}
s.stopSnapping()
}
if logger.IsEnableWarn() {
if s.raftFsm.leader != NoLeader {
if preLeader == NoLeader {
logger.Warn("raft:[%v] elected leader %v at term %d.", s.raftFsm.id, s.raftFsm.leader, s.raftFsm.term)
} else {
logger.Warn("raft:[%v] changed leader from %v to %v at term %d.", s.raftFsm.id, preLeader, s.raftFsm.leader, s.raftFsm.term)
}
} else {
logger.Warn("raft:[%v] lost leader %v at term %d.", s.raftFsm.id, preLeader, s.raftFsm.term)
}
}
s.raftConfig.StateMachine.HandleLeaderChange(s.raftFsm.leader)
}
if updated {
atomic.StorePointer(&s.curSoftSt, unsafe.Pointer(&softState{leader: s.raftFsm.leader, term: s.raftFsm.term}))
}
}
func (s *raft) persist() {
unstableEntries := s.raftFsm.raftLog.unstableEntries()
if len(unstableEntries) > 0 {
if err := s.raftConfig.Storage.StoreEntries(unstableEntries); err != nil {
panic(AppPanicError(fmt.Sprintf("[raft->persist][%v] storage storeEntries err: [%v].", s.raftFsm.id, err)))
}
}
if s.raftFsm.raftLog.committed != s.prevHardSt.Commit || s.raftFsm.term != s.prevHardSt.Term || s.raftFsm.vote != s.prevHardSt.Vote {
hs := proto.HardState{Term: s.raftFsm.term, Vote: s.raftFsm.vote, Commit: s.raftFsm.raftLog.committed}
if err := s.raftConfig.Storage.StoreHardState(hs); err != nil {
panic(AppPanicError(fmt.Sprintf("[raft->persist][%v] storage storeHardState err: [%v].", s.raftFsm.id, err)))
}
s.prevHardSt = hs
}
}
func (s *raft) apply() {
committedEntries := s.raftFsm.raftLog.nextEnts(noLimit)
// check ready read index
if len(committedEntries) == 0 {
readIndexes := s.raftFsm.readOnly.getReady(s.curApplied.Get())
if len(readIndexes) == 0 {
return
}
apply := pool.getApply()
apply.readIndexes = readIndexes
select {
case <-s.stopc:
respondReadIndex(readIndexes, ErrStopped)
case s.applyc <- apply:
}
return
}
for _, entry := range committedEntries {
apply := pool.getApply()
apply.term = entry.Term
apply.index = entry.Index
if future, ok := s.pending[entry.Index]; ok {
apply.future = future
delete(s.pending, entry.Index)
}
apply.readIndexes = s.raftFsm.readOnly.getReady(entry.Index)
switch entry.Type {
case proto.EntryNormal:
if len(entry.Data) > 0 {
apply.command = entry.Data
}
case proto.EntryConfChange:
cc := new(proto.ConfChange)
cc.Decode(entry.Data)
apply.command = cc
// repl apply
peerChange := cc.Peer
worked := s.raftFsm.applyConfChange(cc)
if cc.Type == proto.ConfRemoveNode && worked {
if _, ok := s.raftFsm.replicas[peerChange.PeerID]; !ok {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] applying configuration peer [%v] be removed and stop snapshot", s.raftFsm.id, peerChange)
}
s.removeSnapping(peerChange.PeerID)
}
}
s.peerState.change(cc)
if logger.IsEnableWarn() {
logger.Warn("raft[%v] applying configuration change %v.", s.raftFsm.id, cc)
}
}
select {
case <-s.stopc:
if apply.future != nil {
apply.future.respond(nil, ErrStopped)
}
if len(apply.readIndexes) > 0 {
respondReadIndex(apply.readIndexes, ErrStopped)
}
case s.applyc <- apply:
}
}
}
func (s *raft) advance() {
s.raftFsm.raftLog.appliedTo(s.raftFsm.raftLog.committed)
entries := s.raftFsm.raftLog.unstableEntries()
if len(entries) > 0 {
s.raftFsm.raftLog.stableTo(entries[len(entries)-1].Index, entries[len(entries)-1].Term)
}
}
func (s *raft) containsUpdate() bool {
return len(s.raftFsm.raftLog.unstableEntries()) > 0 || s.raftFsm.raftLog.committed > s.raftFsm.raftLog.applied || len(s.raftFsm.msgs) > 0 ||
s.raftFsm.raftLog.committed != s.prevHardSt.Commit || s.raftFsm.term != s.prevHardSt.Term || s.raftFsm.vote != s.prevHardSt.Vote ||
s.raftFsm.readOnly.containsUpdate(s.curApplied.Get())
}
func (s *raft) resetPending(err error) {
if len(s.pending) > 0 {
for k, v := range s.pending {
v.respond(nil, err)
delete(s.pending, k)
}
}
}
func (s *raft) resetTick() {
for {
select {
case <-s.tickc:
default:
return
}
}
}
func (s *raft) resetApply() {
for {
select {
case apply := <-s.applyc:
if apply.future != nil {
apply.future.respond(nil, ErrStopped)
}
if len(apply.readIndexes) > 0 {
respondReadIndex(apply.readIndexes, ErrStopped)
}
pool.returnApply(apply)
default:
return
}
}
}
func (s *raft) getStatus() *Status {
stopped := false
select {
case <-s.stopc:
stopped = true
default:
}
st := &Status{
ID: s.raftFsm.id,
NodeID: s.config.NodeID,
Leader: s.raftFsm.leader,
Term: s.raftFsm.term,
Index: s.raftFsm.raftLog.lastIndex(),
Commit: s.raftFsm.raftLog.committed,
Applied: s.curApplied.Get(),
Vote: s.raftFsm.vote,
State: s.raftFsm.state.String(),
RestoringSnapshot: s.restoringSnapshot.Get(),
PendQueue: len(s.pending),
RecvQueue: len(s.recvc),
AppQueue: len(s.applyc),
Stopped: stopped,
}
if s.raftFsm.state == stateLeader {
st.Replicas = make(map[uint64]*ReplicaStatus)
for id, p := range s.raftFsm.replicas {
st.Replicas[id] = &ReplicaStatus{
Match: p.match,
Commit: p.committed,
Next: p.next,
State: p.state.String(),
Snapshoting: p.state == replicaStateSnapshot,
Paused: p.paused,
Active: p.active,
LastActive: p.lastActive,
Inflight: p.count,
}
}
}
return st
}
func (s *raft) handlePanic(err interface{}) {
fatalStopc <- s.raftFsm.id
fatal := &FatalError{
ID: s.raftFsm.id,
Err: fmt.Errorf("raft[%v] occur panic error: [%v]", s.raftFsm.id, err),
}
s.raftConfig.StateMachine.HandleFatalEvent(fatal)
}
func (s *raft) getPeers() (peers []uint64) {
return s.peerState.get()
}
func (s *raft) readIndex(future *Future) {
if !s.isLeader() {
future.respond(nil, ErrNotLeader)
return
}
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.readIndexC <- future:
}
}
func (s *raft) getEntries(future *Future, startIndex uint64, maxSize uint64) {
req := &entryRequest{
future: future,
index: startIndex,
maxSize: maxSize,
}
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.entryRequestC <- req:
}
}
func (s *raft) getEntriesInLoop(req *entryRequest) {
select {
case <-s.stopc:
req.future.respond(nil, ErrStopped)
return
default:
}
if !s.isLeader() {
req.future.respond(nil, ErrNotLeader)
return
}
if req.index > s.raftFsm.raftLog.lastIndex() {
req.future.respond(nil, nil)
return
}
if req.index < s.raftFsm.raftLog.firstIndex() {
req.future.respond(nil, ErrCompacted)
return
}
entries, err := s.raftFsm.raftLog.entries(req.index, req.maxSize)
req.future.respond(entries, err)
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"math/rand"
"strings"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"time"
)
// CampaignType represents the type of campaigning
// the reason we use the type of string instead of uint64
// is because it's simpler to compare and fill in raft entries
type CampaignType string
// NoLeader is a placeholder nodeID used when there is no leader.
const NoLeader uint64 = 0
// Possible values for CampaignType
const (
// campaignPreElection represents the first phase of a normal election when
// Config.PreVote is true.
campaignPreElection CampaignType = "CampaignPreElection"
// campaignElection represents a normal (time-based) election (the second phase
// of the election when Config.PreVote is true).
campaignElection CampaignType = "CampaignElection"
)
type stepFunc func(r *raftFsm, m *proto.Message)
type raftFsm struct {
id uint64
term uint64
vote uint64
leader uint64
electionElapsed int
heartbeatElapsed int
// randElectionTick is a random number in [electiontick, 2 * electiontick - 1].
// It gets reset when raft changes its state to follower or candidate.
randElectionTick int
// A new configuration is ignored while an unapplied configuration exists.
pendingConf bool
state fsmState
sm StateMachine
config *Config
raftLog *raftLog
rand *rand.Rand
votes map[uint64]bool
acks map[uint64]bool
replicas map[uint64]*replica
readOnly *readOnly
msgs []*proto.Message
step stepFunc
tick func()
stopCh chan struct{}
mo Monitor
// electionFirstBegin marks the start time of a run of consecutive elections.
// It is valid if and only if mo != nil.
electionFirstBegin time.Time
}
func (fsm *raftFsm) getReplicas() (m string) {
for id := range fsm.replicas {
m += fmt.Sprintf(" [%v] ,", id)
}
return m
}
func newRaftFsm(config *Config, raftConfig *RaftConfig) (*raftFsm, error) {
raftlog, err := newRaftLog(raftConfig.Storage)
if err != nil {
return nil, err
}
hs, err := raftConfig.Storage.InitialState()
if err != nil {
return nil, err
}
r := &raftFsm{
id: raftConfig.ID,
sm: raftConfig.StateMachine,
mo: raftConfig.Monitor,
config: config,
leader: NoLeader,
raftLog: raftlog,
replicas: make(map[uint64]*replica),
readOnly: newReadOnly(raftConfig.ID, config.ReadOnlyOption),
}
r.rand = rand.New(rand.NewSource(int64(config.NodeID + r.id)))
for _, p := range raftConfig.Peers {
r.replicas[p.ID] = newReplica(p, 0)
}
if !hs.IsEmpty() {
if raftConfig.Applied > r.raftLog.lastIndex() {
logger.Info("newRaft[%v] update [applied: %d, to lastindex: %d]", r.id, raftConfig.Applied, raftlog.lastIndex())
raftConfig.Applied = r.raftLog.lastIndex()
}
if hs.Commit > r.raftLog.lastIndex() {
logger.Info("newRaft[%v] update [hardState commit: %d, to lastindex: %d]", r.id, hs.Commit, raftlog.lastIndex())
hs.Commit = r.raftLog.lastIndex()
}
if err := r.loadState(hs); err != nil {
return nil, err
}
}
logger.Info("newRaft[%v] [commit: %d, applied: %d, lastindex: %d]", r.id, raftlog.committed, raftConfig.Applied, raftlog.lastIndex())
if raftConfig.Applied > 0 {
lasti := raftlog.lastIndex()
if lasti == 0 {
// If there is application data but no raft log, then restore to initial state.
raftlog.committed = 0
raftConfig.Applied = 0
} else if lasti < raftConfig.Applied {
// If lastIndex<appliedIndex, then the log as the standard.
raftlog.committed = lasti
raftConfig.Applied = lasti
} else if raftlog.committed < raftConfig.Applied {
raftlog.committed = raftConfig.Applied
}
raftlog.appliedTo(raftConfig.Applied)
}
// recover committed
if err := r.recoverCommit(); err != nil {
return nil, err
}
if raftConfig.Leader == config.NodeID {
if raftConfig.Term != 0 && r.term <= raftConfig.Term {
r.term = raftConfig.Term
r.state = stateLeader
r.becomeLeader()
r.bcastAppend()
} else {
r.becomeFollower(r.term, NoLeader)
}
} else {
if raftConfig.Leader == NoLeader {
r.becomeFollower(r.term, NoLeader)
} else {
r.becomeFollower(raftConfig.Term, raftConfig.Leader)
}
}
if logger.IsEnableDebug() {
peerStrs := make([]string, 0)
for _, p := range r.peers() {
peerStrs = append(peerStrs, fmt.Sprintf("%v", p.String()))
}
logger.Debug("newRaft[%v] [peers: [%s], term: %d, commit: %d, applied: %d, lastindex: %d, lastterm: %d]",
r.id, strings.Join(peerStrs, ","), r.term, r.raftLog.committed, r.raftLog.applied, r.raftLog.lastIndex(), r.raftLog.lastTerm())
}
r.stopCh = make(chan struct{}, 1)
go r.doRandomSeed()
return r, nil
}
func (r *raftFsm) doRandomSeed() {
// The reseed interval must be positive: time.Tick with a non-positive duration returns a nil channel and the reseed would never fire.
ticker := time.Tick(time.Duration(rand.Intn(5)+1) * time.Second)
for {
select {
case <-ticker:
r.rand.Seed(time.Now().UnixNano())
case <-r.stopCh:
return
}
}
}
func (r *raftFsm) StopFsm() {
peers := make([]proto.Peer, 0, len(r.replicas))
for _, repl := range r.replicas {
peers = append(peers, repl.peer)
}
if r.mo != nil {
r.mo.RemovePartition(r.id, peers)
}
close(r.stopCh)
}
// raft main method
func (r *raftFsm) Step(m *proto.Message) {
if m.Type == proto.LocalMsgHup {
if r.state != stateLeader && r.promotable() {
ents, err := r.raftLog.slice(r.raftLog.applied+1, r.raftLog.committed+1, noLimit)
if err != nil {
errMsg := fmt.Sprintf("[raft->Step][%v]unexpected error getting unapplied entries:[%v]", r.id, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
if n := numOfPendingConf(ents); n != 0 && r.raftLog.committed > r.raftLog.applied {
if logger.IsEnableWarn() {
logger.Warn("[raft->Step][%v] cannot campaign at term %d since there are still %d pending configuration changes to apply.", r.id, r.term, n)
}
return
}
if logger.IsEnableInfo() {
logger.Info("[raft->Step][%v] is starting a new election at term[%d].", r.id, r.term)
}
// Only a leadership transfer sets ForceVote=true.
// Leadership transfers never use pre-vote even if r.preVote is true; we
// know we are not recovering from a partition so there is no need for the
// extra round trip.
if r.config.PreVote && !m.ForceVote {
r.campaign(m.ForceVote, campaignPreElection)
} else {
r.campaign(m.ForceVote, campaignElection)
}
} else if logger.IsEnableDebug() && r.state == stateLeader {
logger.Debug("[raft->Step][%v] ignoring LocalMsgHup because already leader.", r.id)
} else if logger.IsEnableDebug() {
var replicas []uint64
for id := range r.replicas {
replicas = append(replicas, id)
}
logger.Debug("[raft->Step][%v] state %v, replicas %v.", r.id, r.state, replicas)
}
return
}
switch {
case m.Term == 0:
// local message
case m.Term > r.term:
if logger.IsEnableDebug() {
logger.Debug("[raft->Step][%v term: %d] received a [%s] message with higher term from [%v term: %d],ForceVote[%v].",
r.id, r.term, m.Type, m.From, m.Term, m.ForceVote)
}
if m.Type == proto.ReqMsgVote || m.Type == proto.ReqMsgPreVote {
inLease := r.config.LeaseCheck && r.leader != NoLeader
if r.leader != m.From && inLease && !m.ForceVote && r.electionElapsed < r.randElectionTick {
if logger.IsEnableWarn() {
logger.Warn("[raft->Step][%v logterm: %d, index: %d, vote: %v] ignored %v from %v [logterm: %d, index: %d] at term %d: lease is not expired.",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.Type, m.From, m.LogTerm, m.Index, r.term)
}
return
}
}
switch {
case m.Type == proto.ReqMsgPreVote:
// Never change our term in response to a PreVote
case m.Type == proto.RespMsgPreVote && !m.Reject:
// We send pre-vote requests with a term in our future. If the
// pre-vote is granted, we will increment our term when we get a
// quorum. If it is not, the term comes from the node that
// rejected our vote so we should become a follower at the new
// term.
default:
if logger.IsEnableDebug() {
logger.Debug("[raft->Step][%x,%d] [term: %d] received a %s message with higher term from %x [term: %d]",
r.id, r.config.ReplicateAddr, r.term, m.Type, m.From, m.Term)
}
if m.Type == proto.ReqMsgAppend || m.Type == proto.ReqMsgHeartBeat || m.Type == proto.ReqMsgSnapShot {
r.becomeFollower(m.Term, m.From)
} else {
r.becomeFollower(m.Term, NoLeader)
}
}
case m.Term < r.term:
if (r.config.LeaseCheck || r.config.PreVote) && (m.Type == proto.ReqMsgHeartBeat || m.Type == proto.ReqMsgAppend) {
// We have received messages from a leader at a lower term. It is possible
// that these messages were simply delayed in the network, but this could
// also mean that this node has advanced its term number during a network
// partition, and it is now unable to either win an election or to rejoin
// the majority on the old term. If checkQuorum is false, this will be
// handled by incrementing term numbers in response to MsgVote with a
// higher term, but if checkQuorum is true we may not advance the term on
// MsgVote and must generate other messages to advance the term. The net
// result of these two features is to minimize the disruption caused by
// nodes that have been removed from the cluster's configuration: a
// removed node will send MsgVotes (or MsgPreVotes) which will be ignored,
// but it will not receive MsgApp or MsgHeartbeat, so it will not create
// disruptive term increases, by notifying the leader of this node's activeness.
// The above also holds for Pre-Vote.
//
// When follower gets isolated, it soon starts an election ending
// up with a higher term than leader, although it won't receive enough
// votes to win the election. When it regains connectivity, this response
// with "proto.MsgAppResp" of higher term would force leader to step down.
// However, this disruption is inevitable to free this stuck node with
// fresh election. This can be prevented with Pre-Vote phase.
r.send(&proto.Message{To: m.From, Term: r.term, Type: proto.RespMsgAppend})
} else if m.Type == proto.ReqMsgPreVote {
// Before Pre-Vote was enabled, a candidate could exist with a higher term but
// a shorter log. After upgrading to Pre-Vote, the cluster may deadlock if
// we drop messages with a lower term.
if logger.IsEnableInfo() {
logger.Info("%x [logterm: %d, index: %d, vote: %x] rejected %s from %x [logterm: %d, index: %d] at term %d",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.Type, m.From, m.LogTerm, m.Index, r.term)
}
r.send(&proto.Message{To: m.From, Term: r.term, Type: proto.RespMsgPreVote, Reject: true})
} else {
// ignore other cases
if logger.IsEnableInfo() {
logger.Info("%x [term: %d] ignored a %s message with lower term from %x [term: %d]",
r.id, r.term, m.Type, m.From, m.Term)
}
}
return
}
if m.Type == proto.ReqMsgPreVote || m.Type == proto.ReqMsgVote {
// We can vote if this is a repeat of a vote we've already cast...
canVote := r.vote == m.From ||
// ...we haven't voted and we don't think there's a leader yet in this term...
(r.vote == NoLeader && r.leader == NoLeader) ||
// ...or this is a PreVote for a future term...
(m.Type == proto.ReqMsgPreVote && m.Term > r.term)
// ...and we believe the candidate is up to date.
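// For example, a node that already granted its vote to this candidate in the
// current term may grant a repeated ReqMsgVote, while a request from any other
// candidate for the same term is rejected.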
var respType proto.MsgType
if m.Type == proto.ReqMsgPreVote {
respType = proto.RespMsgPreVote
} else {
respType = proto.RespMsgVote
}
if canVote && r.raftLog.isUpToDate(m.Index, m.LogTerm, 0, 0) {
// Note: it turns out that learners must be allowed to cast votes.
// This seems counter-intuitive but is necessary in the situation in which
// a learner has been promoted (i.e. is now a voter) but has not learned
// about this yet.
// For example, consider a group in which id=1 is a learner and id=2 and
// id=3 are voters. A configuration change promoting 1 can be committed on
// the quorum `{2,3}` without the config change being appended to the
// learner's log. If the leader (say 2) fails, there are de facto two
// voters remaining. Only 3 can win an election (due to its log containing
// all committed entries), but to do so it will need 1 to vote. But 1
// considers itself a learner and will continue to do so until 3 has
// stepped up as leader, replicates the conf change to 1, and 1 applies it.
// Ultimately, by receiving a request to vote, the learner realizes that
// the candidate believes it to be a voter, and that it should act
// accordingly. The candidate's config may be stale, too; but in that case
// it won't win the election, at least in the absence of the bug discussed
// in:
// https://github.com/etcd-io/etcd/issues/7625#issuecomment-488798263.
if logger.IsEnableDebug() {
logger.Info("%x [logterm: %d, index: %d, vote: %x] cast %s for %x [logterm: %d, index: %d] at term %d",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.Type, m.From, m.LogTerm, m.Index, r.term)
}
// When responding to Msg{Pre,}Vote messages we include the term
// from the message, not the local term. To see why, consider the
// case where a single node was previously partitioned away and
// its local term is now out of date. If we include the local term
// (recall that for pre-votes we don't update the local term), the
// (pre-)campaigning node on the other end will proceed to ignore
// the message (it ignores all out of date messages).
// The term in the original message and current local term are the
// same in the case of regular votes, but different for pre-votes.
r.send(&proto.Message{To: m.From, Term: m.Term, Type: respType})
if m.Type == proto.ReqMsgVote {
// Only record real votes.
r.electionElapsed = 0
r.vote = m.From
}
} else {
if logger.IsEnableDebug() {
logger.Info("%x [logterm: %d, index: %d, vote: %x] rejected %s from %x [logterm: %d, index: %d] at term %d",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.Type, m.From, m.LogTerm, m.Index, r.term)
}
r.send(&proto.Message{To: m.From, Term: r.term, Type: respType, Reject: true})
}
return
}
r.step(r, m)
}
func (r *raftFsm) loadState(state proto.HardState) error {
if state.Commit < r.raftLog.committed || state.Commit > r.raftLog.lastIndex() {
return fmt.Errorf("[raft->loadState][%v] state.commit %d is out of range [%d, %d]", r.id, state.Commit, r.raftLog.committed, r.raftLog.lastIndex())
}
r.term = state.Term
r.vote = state.Vote
r.raftLog.committed = state.Commit
return nil
}
func (r *raftFsm) recoverCommit() error {
for r.raftLog.applied <= r.raftLog.committed {
committedEntries := r.raftLog.nextEnts(64 * MB)
for _, entry := range committedEntries {
r.raftLog.appliedTo(entry.Index)
switch entry.Type {
case proto.EntryNormal:
if len(entry.Data) == 0 {
continue
}
if _, err := r.sm.Apply(entry.Data, entry.Index); err != nil {
return err
}
case proto.EntryConfChange:
cc := new(proto.ConfChange)
cc.Decode(entry.Data)
if _, err := r.sm.ApplyMemberChange(cc, entry.Index); err != nil {
return err
}
r.applyConfChange(cc)
}
}
if r.raftLog.applied == r.raftLog.committed {
break
}
}
return nil
}
func (r *raftFsm) applyConfChange(cc *proto.ConfChange) (ok bool) {
if cc.Peer.ID == NoLeader {
r.pendingConf = false
return
}
switch cc.Type {
case proto.ConfAddNode:
r.addPeer(cc.Peer)
case proto.ConfRemoveNode:
return r.removePeer(cc.Peer)
case proto.ConfUpdateNode:
r.updatePeer(cc.Peer)
}
return
}
func (r *raftFsm) addPeer(peer proto.Peer) {
r.pendingConf = false
if _, ok := r.replicas[peer.ID]; !ok {
if r.state == stateLeader {
r.replicas[peer.ID] = newReplica(peer, r.config.MaxInflightMsgs)
r.replicas[peer.ID].next = r.raftLog.lastIndex() + 1
} else {
r.replicas[peer.ID] = newReplica(peer, 0)
}
}
}
func (r *raftFsm) removePeer(peer proto.Peer) (ok bool) {
r.pendingConf = false
replica, ok := r.replicas[peer.ID]
if !ok {
return
} else if replica.peer.PeerID != peer.PeerID {
if logger.IsEnableInfo() {
logger.Info("raft[%v] ignore remove peer[%v], current[%v]", r.id, peer.String(), replica.peer.String())
}
return
}
delete(r.replicas, peer.ID)
ok = true
if peer.ID == r.config.NodeID {
r.becomeFollower(r.term, NoLeader)
} else if r.state == stateLeader && len(r.replicas) > 0 {
if r.maybeCommit() {
r.bcastAppend()
}
}
return
}
func (r *raftFsm) updatePeer(peer proto.Peer) {
r.pendingConf = false
if _, ok := r.replicas[peer.ID]; ok {
r.replicas[peer.ID].peer = peer
}
}
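// quorum returns the number of votes (or acks) required for a majority of the
// current configuration: len(replicas)/2 + 1. For example: 1 of 1, 2 of 3,
// 3 of 4, 3 of 5.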
func (r *raftFsm) quorum() int {
return len(r.replicas)/2 + 1
}
func (r *raftFsm) send(m *proto.Message) {
m.ID = r.id
m.From = r.config.NodeID
// Proposals keep the caller-set term, and pre-vote requests carry term+1 set in campaign(); everything else is stamped with the current term.
if m.Type != proto.LocalMsgProp && m.Type != proto.ReqMsgPreVote {
m.Term = r.term
}
r.msgs = append(r.msgs, m)
}
func (r *raftFsm) reset(term, lasti uint64, isLeader bool) {
if r.term != term {
r.term = term
r.vote = NoLeader
}
r.leader = NoLeader
r.electionElapsed = 0
r.heartbeatElapsed = 0
r.votes = make(map[uint64]bool)
r.pendingConf = false
r.readOnly.reset(ErrNotLeader)
if isLeader {
r.randElectionTick = r.config.ElectionTick - 1
for id, p := range r.replicas {
r.replicas[id] = newReplica(p.peer, r.config.MaxInflightMsgs)
r.replicas[id].next = lasti + 1
if id == r.config.NodeID {
r.replicas[id].match = lasti
r.replicas[id].committed = r.raftLog.committed
}
}
} else {
r.resetRandomizedElectionTimeout()
for id, p := range r.replicas {
r.replicas[id] = newReplica(p.peer, 0)
}
}
}
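// resetRandomizedElectionTimeout picks randElectionTick uniformly from
// [ElectionTick, 2*ElectionTick-1]. For example, with ElectionTick=5 the
// timeout is 5..9 ticks, so peers rarely time out at the same tick.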
func (r *raftFsm) resetRandomizedElectionTimeout() {
randTick := r.rand.Intn(r.config.ElectionTick)
r.randElectionTick = r.config.ElectionTick + randTick
logger.Debug("raft[%v,%v] random election timeout randElectionTick=%v, config.ElectionTick=%v, randTick=%v",
r.id, r.config.ReplicateAddr, r.randElectionTick, r.config.ElectionTick, randTick)
}
func (r *raftFsm) pastElectionTimeout() bool {
return r.electionElapsed >= r.randElectionTick
}
func (r *raftFsm) peers() []proto.Peer {
peers := make([]proto.Peer, 0, len(r.replicas))
for _, p := range r.replicas {
peers = append(peers, p.peer)
}
return peers
}
func (r *raftFsm) checkSnapshot(meta proto.SnapshotMeta) bool {
if meta.Index <= r.raftLog.committed {
return false
}
if r.raftLog.matchTerm(meta.Index, meta.Term) {
r.raftLog.commitTo(meta.Index)
return false
}
return true
}
func (r *raftFsm) restore(meta proto.SnapshotMeta) {
if logger.IsEnableWarn() {
logger.Warn("raft [%v, commit: %d, lastindex: %d, lastterm: %d] starts to restore snapshot [index: %d,term:%d]",
r.id, r.raftLog.committed, r.raftLog.lastIndex(), r.raftLog.lastTerm(), meta.Index, meta.Term)
}
r.raftLog.restore(meta.Index)
r.replicas = make(map[uint64]*replica)
for _, p := range meta.Peers {
r.replicas[p.ID] = newReplica(p, 0)
}
}
func (r *raftFsm) addReadIndex(futures []*Future) {
// not leader
if r.leader != r.config.NodeID {
respondReadIndex(futures, ErrNotLeader)
return
}
// check leader commit in current term
if !r.readOnly.committed {
if r.raftLog.zeroTermOnErrCompacted(r.raftLog.term(r.raftLog.committed)) == r.term {
r.readOnly.commit(r.raftLog.committed)
}
}
r.readOnly.add(r.raftLog.committed, futures)
r.bcastReadOnly()
}
func numOfPendingConf(ents []*proto.Entry) int {
n := 0
for i := range ents {
if ents[i].Type == proto.EntryConfChange {
n++
}
}
return n
}
func (r *raftFsm) monitorElection() {
if r.mo == nil {
return
}
now := time.Now()
if r.electionFirstBegin.IsZero() || r.state != stateCandidate {
// Record the time when the leader was most recently lost.
r.electionFirstBegin = now
return
}
// Report to r.mo.MonitorElection while the node keeps campaigning without electing a leader.
r.mo.MonitorElection(r.id, r.getReplicas(), now.Sub(r.electionFirstBegin))
}
func (r *raftFsm) monitorZombie(peer *replica) {
if r.mo == nil {
return
}
now := time.Now()
if peer.lastZombie.Before(peer.lastActive) {
peer.lastZombie = now
}
if du := now.Sub(peer.lastZombie); du > 2*r.config.TickInterval {
r.mo.MonitorZombie(r.id, peer.peer, r.getReplicas(), du)
}
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// becomeCandidate transitions to candidate state; when PreVote is enabled the node first becomes a preCandidate (see becomePreCandidate).
func (r *raftFsm) becomeCandidate() {
if r.state == stateLeader {
panic(AppPanicError(fmt.Sprintf("[raft->becomeCandidate][%v] invalid transition [leader -> candidate].", r.id)))
}
r.monitorElection()
r.step = stepCandidate
r.reset(r.term+1, 0, false)
r.tick = r.tickElection
r.vote = r.config.NodeID
r.state = stateCandidate
if logger.IsEnableDebug() {
logger.Debug("raft[%v] became candidate at term %d.", r.id, r.config.TransportConfig.ReplicateAddr, r.term)
}
}
func stepCandidate(r *raftFsm, m *proto.Message) {
switch m.Type {
case proto.LocalMsgProp:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] no leader at term %d; dropping proposal.", r.id, r.term)
}
proto.ReturnMessage(m)
return
case proto.ReqMsgAppend:
r.becomeFollower(r.term, m.From)
r.handleAppendEntries(m)
proto.ReturnMessage(m)
return
case proto.ReqMsgHeartBeat:
r.becomeFollower(r.term, m.From)
return
case proto.ReqMsgPreVote:
r.becomeFollower(r.term, m.From)
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgPreVote
nmsg.To = m.From
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.ReqMsgVote:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] [logterm: %d, index: %d, vote: %v] rejected vote from %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term)
}
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
nmsg.Reject = true
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.RespMsgVote:
gr := r.poll(m.From, !m.Reject)
if logger.IsEnableDebug() {
logger.Debug("raft[%v] [q:%d] has received %d votes and %d vote rejections.", r.id, r.quorum(), gr, len(r.votes)-gr)
}
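// Win the election as soon as granted votes reach quorum; concede and become
// follower as soon as rejections reach quorum.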
switch r.quorum() {
case gr:
r.becomeLeader()
r.bcastAppend()
case len(r.votes) - gr:
r.becomeFollower(r.term, NoLeader)
}
}
}
func (r *raftFsm) campaign(force bool, t CampaignType) {
var msgType proto.MsgType
var term uint64
if t == campaignPreElection {
r.becomePreCandidate()
msgType = proto.ReqMsgPreVote
term = r.term + 1
} else {
r.becomeCandidate()
msgType = proto.ReqMsgVote
}
if r.quorum() == r.poll(r.config.NodeID, true) {
if t == campaignPreElection {
r.campaign(force, campaignElection)
} else {
r.becomeLeader()
}
return
}
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
li, lt := r.raftLog.lastIndexAndTerm()
if logger.IsEnableDebug() {
logger.Debug("[raft->campaign][%v,%v logterm: %d, index: %d] sent "+
"%v request to %v at term %d. raftFSM[%p]", msgType, r.id, r.config.ReplicateAddr, lt, li, id, r.term, r)
}
m := proto.GetMessage()
m.To = id
m.Type = msgType
m.ForceVote = force
m.Index = li
m.LogTerm = lt
m.Term = term
r.send(m)
}
}
func (r *raftFsm) poll(id uint64, v bool) (granted int) {
if logger.IsEnableDebug() {
if v {
logger.Debug("raft[%v,%v] received vote from %v at term %d.", r.id, r.config.ReplicateAddr, id, r.term)
} else {
logger.Debug("raft[%v,%v] received vote rejection from %v at term %d.", r.id, r.config.ReplicateAddr, id, r.term)
}
}
if _, ok := r.votes[id]; !ok {
r.votes[id] = v
}
for _, vv := range r.votes {
if vv {
granted++
}
}
return granted
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"math"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
func (r *raftFsm) becomeFollower(term, lead uint64) {
r.step = stepFollower
r.reset(term, 0, false)
r.tick = r.tickElection
r.leader = lead
r.state = stateFollower
if logger.IsEnableDebug() {
logger.Debug("[raft][%v,%v] became follower at term[%d] leader[%d].", r.id, r.config.ReplicateAddr, r.term, r.leader)
}
}
func stepFollower(r *raftFsm, m *proto.Message) {
switch m.Type {
case proto.LocalMsgProp:
if r.leader == NoLeader {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] no leader at term %d; dropping proposal.", r.id, r.term)
}
return
}
m.To = r.leader
r.send(m)
return
case proto.ReqMsgAppend:
r.electionElapsed = 0
r.leader = m.From
r.handleAppendEntries(m)
proto.ReturnMessage(m)
return
case proto.ReqMsgHeartBeat:
r.electionElapsed = 0
r.leader = m.From
return
case proto.ReqMsgPreVote:
r.electionElapsed = 0
r.leader = m.From
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgPreVote
nmsg.To = m.From
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.ReqCheckQuorum:
// TODO: remove this
if logger.IsEnableDebug() {
logger.Debug("raft[%d] recv check quorum from %d, index=%d", r.id, m.From, m.Index)
}
r.electionElapsed = 0
r.leader = m.From
nmsg := proto.GetMessage()
nmsg.Type = proto.RespCheckQuorum
nmsg.Index = m.Index
nmsg.To = m.From
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.ReqMsgVote:
fpri, lpri := uint16(math.MaxUint16), uint16(0)
if pr, ok := r.replicas[m.From]; ok {
fpri = pr.peer.Priority
}
if pr, ok := r.replicas[r.config.NodeID]; ok {
lpri = pr.peer.Priority
}
if (!r.config.LeaseCheck || r.leader == NoLeader) && (r.vote == NoLeader || r.vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm, fpri, lpri) {
r.electionElapsed = 0
if logger.IsEnableDebug() {
logger.Debug("raft[%v] [logterm: %d, index: %d, vote: %v] voted for %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term)
}
r.vote = m.From
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
r.send(nmsg)
} else {
if logger.IsEnableDebug() {
logger.Debug("raf[%v] [logterm: %d, index: %d, vote: %v] rejected vote from %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term)
}
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
nmsg.Reject = true
r.send(nmsg)
}
proto.ReturnMessage(m)
return
case proto.LeaseMsgTimeout:
if r.leader == m.From {
r.electionElapsed = 0
nmsg := proto.GetMessage()
nmsg.Type = proto.LocalMsgHup
nmsg.From = r.config.NodeID
r.Step(nmsg)
}
proto.ReturnMessage(m)
return
}
}
func (r *raftFsm) tickElection() {
if !r.promotable() {
r.electionElapsed = 0
return
}
r.electionElapsed++
timeout := false
// check follower lease (2 * electiontimeout)
if r.config.LeaseCheck && r.leader != NoLeader && r.state == stateFollower {
timeout = (r.electionElapsed >= (r.config.ElectionTick << 1))
} else {
timeout = r.pastElectionTimeout()
}
if timeout {
r.electionElapsed = 0
m := proto.GetMessage()
m.Type = proto.LocalMsgHup
m.From = r.config.NodeID
r.Step(m)
}
}
func (r *raftFsm) handleAppendEntries(m *proto.Message) {
if m.Index < r.raftLog.committed {
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = m.From
nmsg.Index = r.raftLog.committed
nmsg.Commit = r.raftLog.committed
r.send(nmsg)
return
}
if mlastIndex, ok := r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...); ok {
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = m.From
nmsg.Index = mlastIndex
nmsg.Commit = r.raftLog.committed
r.send(nmsg)
} else {
if logger.IsEnableDebug() {
logger.Debug("raft[%v logterm: %d, index: %d] rejected msgApp [logterm: %d, index: %d] from %v",
r.id, r.raftLog.zeroTermOnErrCompacted(r.raftLog.term(m.Index)), m.Index, m.LogTerm, m.Index, m.From)
}
// Return a hint to the leader about the maximum index and term that the
// two logs could be divergent at. Do this by searching through the
// follower's log for the maximum (index, term) pair with a term <= the
// MsgApp's LogTerm and an index <= the MsgApp's Index. This can help
// skip all indexes in the follower's uncommitted tail with terms
// greater than the MsgApp's LogTerm.
//
// See the other caller for findConflictByTerm (in stepLeader) for a much
// more detailed explanation of this mechanism.
hintIndex := util.Min(m.Index, r.raftLog.lastIndex())
hintIndex = r.raftLog.findConflictByTerm(hintIndex, m.LogTerm)
hintTerm, err := r.raftLog.term(hintIndex)
if err != nil {
panic(fmt.Sprintf("term(%d) must be valid, but got %v", hintIndex, err))
}
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = m.From
nmsg.Index = m.Index
nmsg.Commit = r.raftLog.committed
nmsg.Reject = true
nmsg.LogTerm = hintTerm
nmsg.RejectHint = hintIndex
r.send(nmsg)
}
}
func (r *raftFsm) promotable() bool {
// todo check snapshot
pr, ok := r.replicas[r.config.NodeID]
return ok && pr.state != replicaStateSnapshot
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"sort"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
func (r *raftFsm) becomeLeader() {
if r.state == stateFollower {
panic(AppPanicError(fmt.Sprintf("[raft->becomeLeader][%v] invalid transition [follower -> leader].", r.id)))
}
r.recoverCommit()
lasti := r.raftLog.lastIndex()
r.step = stepLeader
r.reset(r.term, lasti, true)
r.tick = r.tickHeartbeat
r.leader = r.config.NodeID
r.state = stateLeader
r.acks = nil
if pr, ok := r.replicas[r.config.NodeID]; ok {
pr.active = true
}
ents, err := r.raftLog.entries(r.raftLog.committed+1, noLimit)
if err != nil {
errMsg := fmt.Sprintf("[raft->becomeLeader][%v] unexpected error getting uncommitted entries (%v).", r.id, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
nconf := numOfPendingConf(ents)
if nconf > 1 {
panic(AppPanicError(fmt.Sprintf("[raft->becomeLeader][%v] unexpected double uncommitted config entry.", r.id)))
}
if nconf == 1 {
r.pendingConf = true
}
r.appendEntry(&proto.Entry{Term: r.term, Index: lasti + 1, Data: nil})
if logger.IsEnableDebug() {
logger.Debug("raft[%v,%v] became leader at term %d.index:%d", r.id, r.config.ReplicateAddr, r.term, lasti+1)
}
}
func stepLeader(r *raftFsm, m *proto.Message) {
// These message types do not require any progress for m.From.
switch m.Type {
case proto.LocalMsgProp:
if _, ok := r.replicas[r.config.NodeID]; !ok || len(m.Entries) == 0 {
return
}
for i, e := range m.Entries {
if e.Type == proto.EntryConfChange {
if r.pendingConf {
m.Entries[i] = &proto.Entry{Term: e.Term, Index: e.Index, Type: proto.EntryNormal}
}
r.pendingConf = true
}
}
r.appendEntry(m.Entries...)
r.bcastAppend()
proto.ReturnMessage(m)
return
case proto.ReqMsgVote:
if logger.IsEnableDebug() {
logger.Debug("[raft->stepLeader][%v logterm: %d, index: %d, vote: %v] rejected vote from %v [logterm: %d, index: %d] at term %d",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term)
}
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
nmsg.Reject = true
r.send(nmsg)
proto.ReturnMessage(m)
return
}
// All other message types require a progress for m.From (pr).
pr, prOk := r.replicas[m.From]
if !prOk {
if logger.IsEnableDebug() {
logger.Debug("[raft->stepLeader][%v] no progress available for %v.", r.id, m.From)
}
return
}
switch m.Type {
case proto.RespMsgAppend:
pr.active = true
pr.lastActive = time.Now()
if m.Reject {
if logger.IsEnableDebug() {
logger.Debug("raft[%v, %v, %v, %v] received msgApp rejection(lastindex: %d) from %v for index %d commit %v. replica info [%v,%v,%v,%v]",
r.id, r.raftLog.firstIndex(), r.raftLog.lastIndex(), r.raftLog.committed, m.RejectHint, m.From, m.Index, m.Commit, pr.state, pr.next, pr.committed, pr.match)
}
nextProbeIdx := m.RejectHint
if m.LogTerm > 0 {
// If the follower has an uncommitted log tail, we would end up
// probing one by one until we hit the common prefix.
//
// For example, if the leader has:
//
// idx 1 2 3 4 5 6 7 8 9
// -----------------
// term (L) 1 3 3 3 5 5 5 5 5
// term (F) 1 1 1 1 2 2
//
// Then, after sending an append anchored at (idx=9,term=5) we
// would receive a RejectHint of 6 and LogTerm of 2. Without the
// code below, we would try an append at index 6, which would
// fail again.
//
// However, looking only at what the leader knows about its own
// log and the rejection hint, it is clear that a probe at index
// 6, 5, 4, 3, and 2 must fail as well:
//
// For all of these indexes, the leader's log term is larger than
// the rejection's log term. If a probe at one of these indexes
// succeeded, its log term at that index would match the leader's,
// i.e. 3 or 5 in this example. But the follower already told the
// leader that it is still at term 2 at index 9, and since the
// log term only ever goes up (within a log), this is a contradiction.
//
// At index 1, however, the leader can draw no such conclusion,
// as its term 1 is not larger than the term 2 from the
// follower's rejection. We thus probe at 1, which will succeed
// in this example. In general, with this approach we probe at
// most once per term found in the leader's log.
//
// There is a similar mechanism on the follower (implemented in
// handleAppendEntries via a call to findConflictByTerm) that is
// useful if the follower has a large divergent uncommitted log
// tail[1], as in this example:
//
// idx 1 2 3 4 5 6 7 8 9
// -----------------
// term (L) 1 3 3 3 3 3 3 3 7
// term (F) 1 3 3 4 4 5 5 5 6
//
// Naively, the leader would probe at idx=9, receive a rejection
// revealing the log term of 6 at the follower. Since the leader's
// term at the previous index is already smaller than 6, the leader-
// side optimization discussed above is ineffective. The leader thus
// probes at index 8 and, naively, receives a rejection for the same
// index and log term 5. Again, the leader optimization does not improve
// over linear probing as term 5 is above the leader's term 3 for that
// and many preceding indexes; the leader would have to probe linearly
// until it would finally hit index 3, where the probe would succeed.
//
// Instead, we apply a similar optimization on the follower. When the
// follower receives the probe at index 8 (log term 3), it concludes
// that all of the leader's log preceding that index has log terms of
// 3 or below. The largest index in the follower's log with a log term
// of 3 or below is index 3. The follower will thus return a rejection
// for index=3, log term=3 instead. The leader's next probe will then
// succeed at that index.
//
// [1]: more precisely, if the log terms in the large uncommitted
// tail on the follower are larger than the leader's. At first,
// it may seem unintuitive that a follower could even have such
// a large tail, but it can happen:
//
// 1. Leader appends (but does not commit) entries 2 and 3, crashes.
// idx 1 2 3 4 5 6 7 8 9
// -----------------
// term (L) 1 2 2 [crashes]
// term (F) 1
// term (F) 1
//
// 2. a follower becomes leader and appends entries at term 3.
// -----------------
// term (x) 1 2 2 [down]
// term (F) 1 3 3 3 3
// term (F) 1
//
// 3. term 3 leader goes down, term 2 leader returns as term 4
// leader. It commits the log & entries at term 4.
//
// -----------------
// term (L) 1 2 2 2
// term (x) 1 3 3 3 3 [down]
// term (F) 1
// -----------------
// term (L) 1 2 2 2 4 4 4
// term (F) 1 3 3 3 3 [gets probed]
// term (F) 1 2 2 2 4 4 4
//
// 4. the leader will now probe the returning follower at index
// 7, the rejection points it at the end of the follower's log
// which is at a higher log term than the actually committed
// log.
nextProbeIdx = r.raftLog.findConflictByTerm(m.RejectHint, m.LogTerm)
}
if pr.maybeDecrTo(m.Index, nextProbeIdx, m.Commit) {
if logger.IsEnableDebug() {
logger.Debug("[%v] decreased progress of [%v] to [%s]", r.id, m.From, pr)
}
if pr.state == replicaStateReplicate {
pr.becomeProbe()
}
r.sendAppend(m.From)
}
} else {
oldPaused := pr.isPaused()
if pr.maybeUpdate(m.Index, m.Commit) {
switch {
case pr.state == replicaStateProbe:
pr.becomeReplicate()
case pr.state == replicaStateSnapshot && pr.needSnapshotAbort():
if logger.IsEnableWarn() {
logger.Warn("raft[%v] snapshot aborted, resumed sending replication messages to %v.", r.id, m.From)
}
pr.becomeProbe()
case pr.state == replicaStateReplicate:
pr.inflight.freeTo(m.Index)
}
if r.maybeCommit() {
r.bcastAppend()
} else if oldPaused {
r.sendAppend(m.From)
}
}
}
proto.ReturnMessage(m)
return
case proto.RespMsgHeartBeat:
if pr.state == replicaStateReplicate && pr.inflight.full() {
pr.inflight.freeFirstOne()
}
if !pr.pending && (pr.match < r.raftLog.lastIndex() || pr.committed < r.raftLog.committed) {
r.sendAppend(m.From)
}
pr.active = true
pr.lastActive = time.Now()
if pr.state != replicaStateSnapshot {
pr.pending = false
}
return
case proto.LeaseMsgOffline:
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
nmsg := proto.GetMessage()
nmsg.Type = proto.LeaseMsgTimeout
nmsg.To = id
r.send(nmsg)
}
logger.Debug("[raft][%v] LeaseMsgOffline at term[%d] leader[%d].", r.id, r.term, r.leader)
r.becomeFollower(r.term, NoLeader)
proto.ReturnMessage(m)
return
case proto.RespMsgSnapShot:
if pr.state != replicaStateSnapshot {
return
}
if m.Reject {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] send snapshot to [%v] failed.", r.id, m.From)
}
pr.snapshotFailure()
pr.becomeProbe()
} else {
pr.active = true
pr.lastActive = time.Now()
pr.becomeProbe()
if logger.IsEnableWarn() {
logger.Warn("raft[%v] send snapshot to [%v] succeeded, resumed replication [%s]", r.id, m.From, pr)
}
}
// If the snapshot finished, wait for the RespMsgAppend from the remote node before sending the next ReqMsgAppend.
// If the snapshot failed, wait a heartbeat interval before the next try.
pr.pause()
proto.ReturnMessage(m)
return
case proto.RespCheckQuorum:
// TODO: remove this when stable
if logger.IsEnableDebug() {
logger.Debug("raft[%d] recv check quorum resp from %d, index=%d", r.id, m.From, m.Index)
}
r.readOnly.recvAck(m.Index, m.From, r.quorum())
proto.ReturnMessage(m)
return
}
}
func (r *raftFsm) becomePreCandidate() {
r.acks = make(map[uint64]bool)
r.acks[r.config.NodeID] = true
logger.Debug("raft[%v] became preCandidate at term %d.", r.id, r.term)
r.step = stepPreCandidate
r.reset(r.term, 0, false)
r.tick = r.tickElectionAck
r.state = statePreCandidate
}
func stepPreCandidate(r *raftFsm, m *proto.Message) {
switch m.Type {
case proto.LocalMsgProp:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] no leader at term %d; dropping proposal", r.id, r.term)
}
proto.ReturnMessage(m)
return
case proto.ReqMsgAppend:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] PreCandidate receive append in term %d; become follower.", r.id, r.term)
}
r.becomeFollower(r.term, m.From)
r.handleAppendEntries(m)
proto.ReturnMessage(m)
return
case proto.ReqMsgHeartBeat:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] PreCandidate receive heartbeat in term %d; become follower.", r.id, r.term)
}
r.becomeFollower(r.term, m.From)
return
case proto.ReqMsgPreVote:
r.becomeFollower(r.term, m.From)
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgPreVote
nmsg.To = m.From
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.RespCheckQuorum:
// TODO: remove this when stable
if logger.IsEnableDebug() {
logger.Debug("raft[%d] recv check quorum resp from %d, index=%d", r.id, m.From, m.Index)
}
r.readOnly.recvAck(m.Index, m.From, r.quorum())
proto.ReturnMessage(m)
return
case proto.ReqMsgVote:
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
nmsg.Reject = true
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.RespMsgPreVote:
gr := r.poll(m.From, !m.Reject)
if logger.IsEnableDebug() {
logger.Debug("raft[%v] [q:%d] stepPreCandidate has received %d votes and %d vote rejections.", r.id, r.quorum(), gr, len(r.votes)-gr)
}
switch r.quorum() {
case gr:
r.campaign(false, campaignElection)
case len(r.votes) - gr:
r.becomeFollower(r.term, NoLeader)
}
return
}
}
func (r *raftFsm) tickHeartbeat() {
r.heartbeatElapsed++
r.electionElapsed++
if r.pastElectionTimeout() {
r.electionElapsed = 0
if r.config.LeaseCheck && !r.checkLeaderLease() {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] stepped down to follower since quorum is not active.", r.id)
}
logger.Debug("[raft][%v] heartbeat election timeout at term[%d] leader[%d].", r.id, r.term, r.leader)
r.becomeFollower(r.term, NoLeader)
}
}
if r.state != stateLeader {
return
}
if r.heartbeatElapsed >= r.config.HeartbeatTick {
r.heartbeatElapsed = 0
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
if r.replicas[id].state != replicaStateSnapshot {
r.replicas[id].resume()
}
}
r.bcastReadOnly()
}
}
func (r *raftFsm) tickElectionAck() {
r.electionElapsed++
if r.electionElapsed >= r.config.ElectionTick {
r.electionElapsed = 0
m := proto.GetMessage()
m.Type = proto.LocalMsgHup
m.From = r.config.NodeID
r.Step(m)
}
}
func (r *raftFsm) checkLeaderLease() bool {
var act int
for id, peer := range r.replicas {
if id == r.config.NodeID || peer.state == replicaStateSnapshot {
act++
continue
}
if peer.active {
peer.active = false
act++
} else {
r.monitorZombie(peer)
}
}
return act >= r.quorum()
}
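// maybeCommit sorts the match indexes of all replicas in descending order and
// takes the quorum()-th largest as the candidate commit index. For example,
// with matches {9, 7, 5, 5, 3} in a 5-node group the candidate index is 5,
// since it is replicated on at least 3 nodes; the commit only advances if the
// entry at that index belongs to the current term.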
func (r *raftFsm) maybeCommit() bool {
mis := make(util.Uint64Slice, 0, len(r.replicas))
for _, rp := range r.replicas {
mis = append(mis, rp.match)
}
sort.Sort(sort.Reverse(mis))
mci := mis[r.quorum()-1]
isCommit := r.raftLog.maybeCommit(mci, r.term)
if r.state == stateLeader && r.replicas[r.config.NodeID] != nil {
r.replicas[r.config.NodeID].committed = r.raftLog.committed
}
if r.state == stateLeader && !r.readOnly.committed && isCommit {
if r.raftLog.zeroTermOnErrCompacted(r.raftLog.term(r.raftLog.committed)) == r.term {
r.readOnly.commit(r.raftLog.committed)
}
r.bcastReadOnly()
}
return isCommit
}
func (r *raftFsm) bcastAppend() {
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
r.sendAppend(id)
}
}
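// sendAppend replicates to one follower. If the follower's next index has
// already been compacted out of the log (pr.next < firstIndex) or the needed
// term/entries cannot be read, it falls back to sending a snapshot; otherwise
// it sends the entries starting at pr.next, limited by MaxSizePerMsg.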
func (r *raftFsm) sendAppend(to uint64) {
pr := r.replicas[to]
if pr.isPaused() {
return
}
var (
term uint64
ents []*proto.Entry
errt, erre error
m *proto.Message
)
fi := r.raftLog.firstIndex()
if pr.next >= fi {
term, errt = r.raftLog.term(pr.next - 1)
ents, erre = r.raftLog.entries(pr.next, r.config.MaxSizePerMsg)
}
if pr.next < fi || errt != nil || erre != nil {
if !pr.active {
if logger.IsEnableDebug() {
logger.Debug("[raft->sendAppend][%v]ignore sending snapshot to %v since it is not recently active.", r.id, to)
}
return
}
snapshot, err := r.sm.Snapshot()
if err != nil || snapshot.ApplyIndex() < fi-1 {
panic(AppPanicError(fmt.Sprintf("[raft->sendAppend][%v]failed to send snapshot[%d] to %v because snapshot is unavailable, error is: \r\n%v", r.id, snapshot.ApplyIndex(), to, err)))
}
m = proto.GetMessage()
m.Type = proto.ReqMsgSnapShot
m.To = to
m.Snapshot = snapshot
snapMeta := proto.SnapshotMeta{Index: snapshot.ApplyIndex(), Peers: make([]proto.Peer, 0, len(r.replicas))}
if snapTerm, err := r.raftLog.term(snapMeta.Index); err != nil {
panic(AppPanicError(fmt.Sprintf("[raft->sendAppend][%v]failed to send snapshot to %v because snapshot is unavailable, error is: \r\n%v", r.id, to, err)))
} else {
snapMeta.Term = snapTerm
}
for _, p := range r.replicas {
snapMeta.Peers = append(snapMeta.Peers, p.peer)
}
m.SnapshotMeta = snapMeta
pr.becomeSnapshot(snapMeta.Index)
logger.Debug("[raft->sendAppend][%v][firstindex: %d, commit: %d] sent snapshot[index: %d, term: %d] to [%v][%s]",
r.id, fi, r.raftLog.committed, snapMeta.Index, snapMeta.Term, to, pr)
} else {
m = proto.GetMessage()
m.Type = proto.ReqMsgAppend
m.To = to
m.Index = pr.next - 1
m.LogTerm = term
m.Commit = r.raftLog.committed
m.Entries = append(m.Entries, ents...)
if n := len(m.Entries); n != 0 {
switch pr.state {
case replicaStateReplicate:
last := m.Entries[n-1].Index
pr.update(last)
pr.inflight.add(last)
case replicaStateProbe:
pr.pause()
default:
errMsg := fmt.Sprintf("[repl->sendAppend][%v] is sending append in unhandled state %s.", r.id, pr.state)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
}
}
pr.pending = true
r.send(m)
}
func (r *raftFsm) appendEntry(es ...*proto.Entry) {
r.raftLog.append(es...)
r.replicas[r.config.NodeID].maybeUpdate(r.raftLog.lastIndex(), r.raftLog.committed)
r.maybeCommit()
}
func (r *raftFsm) bcastReadOnly() {
index := r.readOnly.lastPending()
if index == 0 {
return
}
if logger.IsEnableDebug() {
logger.Debug("raft[%d] bcast readonly index: %d", r.id, index)
}
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
msg := proto.GetMessage()
msg.Type = proto.ReqCheckQuorum
msg.To = id
msg.Index = index
r.send(msg)
}
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
type (
fsmState byte
replicaState byte
)
const (
stateFollower fsmState = 0
stateCandidate fsmState = 1
stateLeader fsmState = 2
statePreCandidate fsmState = 3
replicaStateProbe replicaState = 0
replicaStateReplicate replicaState = 1
replicaStateSnapshot replicaState = 2
)
func (st fsmState) String() string {
switch st {
case 0:
return "StateFollower"
case 1:
return "StateCandidate"
case 2:
return "StateLeader"
case 3:
return "statePreCandidate"
}
return ""
}
func (st replicaState) String() string {
switch st {
case 1:
return "ReplicaStateReplicate"
case 2:
return "ReplicaStateSnapshot"
default:
return "ReplicaStateProbe"
}
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"math"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/storage"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
const noLimit = math.MaxUint64
// raftLog is responsible for the operation of the log.
type raftLog struct {
unstable unstable
storage storage.Storage
committed, applied uint64
}
func newRaftLog(storage storage.Storage) (*raftLog, error) {
log := &raftLog{
storage: storage,
}
firstIndex, err := storage.FirstIndex()
if err != nil {
return nil, err
}
lastIndex, err := storage.LastIndex()
if err != nil {
return nil, err
}
log.unstable.offset = lastIndex + 1
log.unstable.entries = make([]*proto.Entry, 0, 256)
log.committed = firstIndex - 1
log.applied = firstIndex - 1
return log, nil
}
func (l *raftLog) String() string {
return fmt.Sprintf("committed=%d, applied=%d, unstable.offset=%d, len(unstable.Entries)=%d", l.committed, l.applied, l.unstable.offset, len(l.unstable.entries))
}
func (l *raftLog) firstIndex() uint64 {
index, err := l.storage.FirstIndex()
if err != nil {
errMsg := fmt.Sprintf("[raftLog->firstIndex]get firstindex from storage err:[%v].", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return index
}
func (l *raftLog) lastIndex() uint64 {
if i, ok := l.unstable.maybeLastIndex(); ok {
return i
}
i, err := l.storage.LastIndex()
if err != nil {
errMsg := fmt.Sprintf("[raftLog->lastIndex]get lastIndex from storage err:[%v]", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return i
}
func (l *raftLog) term(i uint64) (uint64, error) {
dummyIndex := l.firstIndex() - 1
if i < dummyIndex || i > l.lastIndex() {
return 0, nil
}
if t, ok := l.unstable.maybeTerm(i); ok {
return t, nil
}
t, c, err := l.storage.Term(i)
if c {
return 0, ErrCompacted
}
if err == nil {
return t, nil
}
errMsg := fmt.Sprintf("[raftLog->term]get term[%d] from storage err:[%v].", i, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
func (l *raftLog) lastTerm() uint64 {
t, err := l.term(l.lastIndex())
if err != nil {
errMsg := fmt.Sprintf("[raftLog->lastTerm]unexpected error when getting the last term (%v)", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return t
}
func (l *raftLog) lastIndexAndTerm() (uint64, uint64) {
li := l.lastIndex()
t, err := l.term(li)
if err != nil {
errMsg := fmt.Sprintf("[raftLog->lastIndexAndTerm]unexpected error when getting the last term (%v)", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return li, t
}
func (l *raftLog) matchTerm(i, term uint64) bool {
t, err := l.term(i)
if err != nil {
return false
}
return t == term
}
func (l *raftLog) findConflict(ents []*proto.Entry) uint64 {
for _, ne := range ents {
if !l.matchTerm(ne.Index, ne.Term) {
if ne.Index <= l.lastIndex() && logger.IsEnableDebug() {
logger.Debug("[raftLog->findConflict]found conflict at index %d [existing term: %d, conflicting term: %d]", ne.Index, l.zeroTermOnErrCompacted(l.term(ne.Index)), ne.Term)
}
return ne.Index
}
}
return 0
}
func (l *raftLog) maybeAppend(index, logTerm, committed uint64, ents ...*proto.Entry) (lastnewi uint64, ok bool) {
if l.matchTerm(index, logTerm) {
lastnewi = index + uint64(len(ents))
ci := l.findConflict(ents)
switch {
case ci == 0:
case ci <= l.committed:
errMsg := fmt.Sprintf("[raftLog->maybeAppend]entry %d conflict with committed entry [committed(%d)]", ci, l.committed)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
default:
l.append(ents[ci-(index+1):]...)
}
l.commitTo(util.Min(committed, lastnewi))
return lastnewi, true
}
return 0, false
}
func (l *raftLog) append(ents ...*proto.Entry) uint64 {
if len(ents) == 0 {
return l.lastIndex()
}
if after := ents[0].Index - 1; after < l.committed {
errMsg := fmt.Sprintf("[raftLog->append]after(%d) is out of range [committed(%d)]", after, l.committed)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
l.unstable.truncateAndAppend(ents)
return l.lastIndex()
}
func (l *raftLog) unstableEntries() []*proto.Entry {
if len(l.unstable.entries) == 0 {
return nil
}
return l.unstable.entries
}
func (l *raftLog) nextEnts(maxSize uint64) (ents []*proto.Entry) {
off := util.Max(l.applied+1, l.firstIndex())
hi := l.committed + 1
if hi > off {
ents, err := l.slice(off, hi, maxSize)
if err != nil {
errMsg := fmt.Sprintf("[raftLog->nextEnts]unexpected error when getting unapplied[%d,%d) entries (%v)", off, hi, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return ents
}
return nil
}
func (l *raftLog) entries(i uint64, maxsize uint64) ([]*proto.Entry, error) {
if i > l.lastIndex() {
return nil, nil
}
return l.slice(i, l.lastIndex()+1, maxsize)
}
func (l *raftLog) maybeCommit(maxIndex, term uint64) bool {
if maxIndex > l.committed && l.zeroTermOnErrCompacted(l.term(maxIndex)) == term {
l.commitTo(maxIndex)
return true
}
return false
}
func (l *raftLog) commitTo(tocommit uint64) {
if l.committed < tocommit {
if l.lastIndex() < tocommit {
errMsg := fmt.Sprintf("[raftLog->commitTo]tocommit(%d) is out of range [lastIndex(%d)]", tocommit, l.lastIndex())
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
l.committed = tocommit
}
}
func (l *raftLog) appliedTo(i uint64) {
if i == 0 {
return
}
if l.committed < i || i < l.applied {
errMsg := fmt.Sprintf("[raftLog->appliedTo]applied(%d) is out of range [prevApplied(%d), committed(%d)]", i, l.applied, l.committed)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
l.applied = i
}
func (l *raftLog) stableTo(i, t uint64) { l.unstable.stableTo(i, t) }
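// isUpToDate reports whether a candidate log with last entry (lasti, term) is
// at least as up-to-date as ours: a higher last term wins, equal terms compare
// last indexes, and an exact tie is broken by peer priority (fpri >= lpri).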
func (l *raftLog) isUpToDate(lasti, term uint64, fpri, lpri uint16) bool {
li, lt := l.lastIndexAndTerm()
return term > lt || (term == lt && lasti > li) || (term == lt && lasti == li && fpri >= lpri)
}
func (l *raftLog) restore(index uint64) {
if logger.IsEnableDebug() {
logger.Debug("[raftLog->restore]log [%s] starts to restore snapshot [index: %d]", l.String(), index)
}
l.committed = index
l.applied = index
l.unstable.restore(index)
}
func (l *raftLog) slice(lo, hi uint64, maxSize uint64) ([]*proto.Entry, error) {
if lo == hi {
return nil, nil
}
err := l.mustCheckOutOfBounds(lo, hi)
if err != nil {
return nil, err
}
var ents []*proto.Entry
if lo < l.unstable.offset {
storedhi := util.Min(hi, l.unstable.offset)
storedEnts, cmp, err := l.storage.Entries(lo, storedhi, maxSize)
if cmp {
return nil, ErrCompacted
} else if err != nil {
errMsg := fmt.Sprintf("[raftLog->slice]get entries[%d:%d) from storage err:[%v].", lo, storedhi, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
// check if ents has reached the size limitation
if uint64(len(storedEnts)) < storedhi-lo {
return storedEnts, nil
}
ents = storedEnts
}
if hi > l.unstable.offset {
unstable := l.unstable.slice(util.Max(lo, l.unstable.offset), hi)
if len(ents) > 0 {
ents = append([]*proto.Entry{}, ents...)
ents = append(ents, unstable...)
} else {
ents = unstable
}
}
if maxSize == noLimit {
return ents, nil
}
return limitSize(ents, maxSize), nil
}
// l.firstIndex() <= lo <= hi <= l.lastIndex()+1
func (l *raftLog) mustCheckOutOfBounds(lo, hi uint64) error {
if lo > hi {
errMsg := fmt.Sprintf("[raftLog->mustCheckOutOfBounds]invalid slice %d > %d", lo, hi)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
fi := l.firstIndex()
if lo < fi {
return ErrCompacted
}
li := l.lastIndex()
length := li - fi + 1
if lo < fi || hi > fi+length {
errMsg := fmt.Sprintf("[raftLog->mustCheckOutOfBounds]slice[%d,%d) out of bound [%d,%d]", lo, hi, fi, li)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return nil
}
func (l *raftLog) zeroTermOnErrCompacted(t uint64, err error) uint64 {
if err == nil {
return t
}
if err == ErrCompacted {
return 0
}
errMsg := fmt.Sprintf("[raftLog->zeroTermOnErrCompacted]unexpected error (%v)", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
func (l *raftLog) allEntries() []*proto.Entry {
ents, err := l.entries(l.firstIndex(), noLimit)
if err == nil {
return ents
}
if err == ErrCompacted { // try again if there was a racing compaction
return l.allEntries()
}
errMsg := fmt.Sprintf("[log->allEntries]get all entries err:[%v]", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
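// limitSize returns the longest prefix of ents whose cumulative size stays
// within maxSize, but always at least one entry. For example, entry sizes
// {3, 4, 5} with maxSize=8 yield the first two entries (3+4=7).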
func limitSize(ents []*proto.Entry, maxSize uint64) []*proto.Entry {
if len(ents) == 0 || maxSize == noLimit {
return ents
}
size := ents[0].Size()
limit := 1
for l := len(ents); limit < l; limit++ {
size += ents[limit].Size()
if size > maxSize {
break
}
}
return ents[:limit]
}
// findConflictByTerm takes an (index, term) pair (indicating a conflicting log
// entry on a leader/follower during an append) and finds the largest index in
// log l with a term <= `term` and an index <= `index`. If no such index exists
// in the log, the log's first index is returned.
//
// The index provided MUST be equal to or less than l.lastIndex(). Invalid
// inputs log a warning and the input index is returned.
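//
// For example, with log terms [1, 1, 3, 3, 5] at indexes 1..5, a call
// findConflictByTerm(5, 2) walks back past the entries with terms 5 and 3 and
// returns index 2, the largest index whose term (1) is <= 2.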
func (l *raftLog) findConflictByTerm(index uint64, term uint64) uint64 {
if li := l.lastIndex(); index > li {
// NB: such calls should not exist, but since there is a straightforward
// way to recover, do it.
//
// It is tempting to also check something about the first index, but
// there is odd behavior with peers that have no log, in which case
// lastIndex will return zero and firstIndex will return one, which
// leads to calls with an index of zero into this method.
logger.Warn("index(%d) is out of range [0, lastIndex(%d)] in findConflictByTerm",
index, li)
return index
}
for {
logTerm, err := l.term(index)
if logTerm <= term || err != nil {
break
}
index--
}
return index
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// unstable temporarily stores the log entries that have not been persisted yet.
// The entry at slice position i has log position i+unstable.offset.
// unstable supports group commit.
// Note that unstable.offset may be less than the highest log position in storage;
// this means that the next write to storage might need to truncate the log before persisting unstable.entries.
type unstable struct {
offset uint64
// all entries that have not yet been written to storage.
entries []*proto.Entry
}
// maybeLastIndex returns the last index if it has at least one unstable entry.
func (u *unstable) maybeLastIndex() (uint64, bool) {
if l := len(u.entries); l != 0 {
return u.offset + uint64(l) - 1, true
}
return 0, false
}
// maybeTerm returns the term of the entry at index i, if there is any.
func (u *unstable) maybeTerm(i uint64) (uint64, bool) {
if i < u.offset {
return 0, false
}
last, ok := u.maybeLastIndex()
if !ok || i > last {
return 0, false
}
return u.entries[i-u.offset].Term, true
}
func (u *unstable) stableTo(i, t uint64) {
gt, ok := u.maybeTerm(i)
if !ok {
return
}
if gt == t && i >= u.offset {
l := uint64(len(u.entries))
diff := l - (i + 1 - u.offset)
if diff > 0 {
copy(u.entries, u.entries[i+1-u.offset:l])
}
for k := diff; k < l; k++ {
u.entries[k] = nil
}
u.entries = u.entries[0:diff]
u.offset = i + 1
}
}
func (u *unstable) restore(index uint64) {
for i, l := 0, len(u.entries); i < l; i++ {
u.entries[i] = nil
}
u.entries = u.entries[0:0]
u.offset = index + 1
}
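// truncateAndAppend handles three cases. With u.offset=5 and u.entries holding
// indexes 5..7: appending entries starting at 8 is a plain append; entries
// starting at or before 5 replace everything and reset the offset; entries
// starting at 6 or 7 keep the prefix up to after-1 and then append.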
func (u *unstable) truncateAndAppend(ents []*proto.Entry) {
after := ents[0].Index
switch {
case after == u.offset+uint64(len(u.entries)):
// after is the next index in u.entries; append directly
u.entries = append(u.entries, ents...)
case after <= u.offset:
// The log is being truncated to before our current offset portion, so set the offset and replace the entries
for i, l := 0, len(u.entries); i < l; i++ {
u.entries[i] = nil
}
u.entries = append(u.entries[0:0], ents...)
u.offset = after
default:
// truncate to after and copy to u.entries then append
u.entries = append(u.entries[0:0], u.slice(u.offset, after)...)
u.entries = append(u.entries, ents...)
}
}
func (u *unstable) slice(lo uint64, hi uint64) []*proto.Entry {
u.mustCheckOutOfBounds(lo, hi)
return u.entries[lo-u.offset : hi-u.offset]
}
// u.offset <= lo <= hi <= u.offset+len(u.entries)
func (u *unstable) mustCheckOutOfBounds(lo, hi uint64) {
if lo > hi {
errMsg := fmt.Sprintf("unstable.slice[%d,%d) is invalid.", lo, hi)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
upper := u.offset + uint64(len(u.entries))
if lo < u.offset || hi > upper {
errMsg := fmt.Sprintf("unstable.slice[%d,%d) out of bound [%d,%d].", lo, hi, u.offset, upper)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
// replica represents a follower's replication progress from the leader's point of view.
// The leader maintains the progress of all followers and sends entries to each follower based on its progress.
type replica struct {
inflight
peer proto.Peer
state replicaState
paused, active, pending bool
match, next, committed, pendingSnap uint64
lastActive time.Time
lastZombie time.Time
}
func newReplica(peer proto.Peer, maxInflight int) *replica {
repl := &replica{
peer: peer,
state: replicaStateProbe,
lastActive: time.Now(),
}
if maxInflight > 0 {
repl.inflight.size = maxInflight
repl.inflight.buffer = make([]uint64, maxInflight)
}
return repl
}
func (r *replica) resetState(state replicaState) {
logger.Debug("raft resetState from [%v]", r)
r.paused = false
r.pendingSnap = 0
r.state = state
logger.Debug("raft resetState to [%v]", r)
r.reset()
}
func (r *replica) becomeProbe() {
if r.state == replicaStateSnapshot {
pendingSnap := r.pendingSnap
r.resetState(replicaStateProbe)
r.next = util.Max(r.match+1, pendingSnap+1)
} else {
r.resetState(replicaStateProbe)
r.next = r.match + 1
}
}
func (r *replica) becomeReplicate() {
r.resetState(replicaStateReplicate)
r.next = r.match + 1
}
func (r *replica) becomeSnapshot(index uint64) {
r.resetState(replicaStateSnapshot)
r.pendingSnap = index
}
func (r *replica) update(index uint64) {
r.next = index + 1
}
func (r *replica) maybeUpdate(index, commit uint64) bool {
updated := false
if r.committed < commit {
r.committed = commit
}
if r.match < index {
r.match = index
updated = true
r.resume()
}
next := index + 1
if r.next < next {
r.next = next
}
return updated
}
func (r *replica) maybeDecrTo(rejected, last, commit uint64) bool {
if r.state == replicaStateReplicate {
if r.committed < commit {
r.committed = commit
}
if rejected <= r.match {
return false
}
r.next = r.match + 1
return true
}
//Probe State
if r.next-1 != rejected {
return false
}
if r.next = util.Min(rejected, last+1); r.next < 1 {
r.next = 1
}
r.committed = commit
r.resume()
return true
}
func (r *replica) snapshotFailure() { r.pendingSnap = 0 }
func (r *replica) needSnapshotAbort() bool {
return r.state == replicaStateSnapshot && r.match >= r.pendingSnap
}
func (r *replica) pause() { r.paused = true }
func (r *replica) resume() { r.paused = false }
func (r *replica) isPaused() bool {
switch r.state {
case replicaStateProbe:
return r.paused
case replicaStateSnapshot:
return true
default:
return r.full()
}
}
func (r *replica) String() string {
return fmt.Sprintf("next = %d, match = %d, commit = %d, state = %s, waiting = %v, pendingSnapshot = %d", r.next, r.match, r.committed, r.state, r.isPaused(), r.pendingSnap)
}
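// Illustrative sketch (not part of the package; exampleReplicaProgress is a hypothetical
// helper): a follower's progress starts in the probe state, switches to replicate once an
// append is acknowledged, and maybeDecrTo backs next off after a rejection.
func exampleReplicaProgress() {
	r := newReplica(proto.Peer{}, 128)
	if r.maybeUpdate(10, 8) { // follower acknowledged entries up to index 10
		r.becomeReplicate() // stream optimistically from next = match + 1 = 11
	}
	_ = r.maybeDecrTo(11, 10, 8) // a rejection in replicate state resets next to match+1
}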
// inflight is the replication sliding window, used to avoid overflowing the sending buffer.
type inflight struct {
start int
count int
size int
buffer []uint64
}
func (in *inflight) add(index uint64) {
if in.full() {
panic(AppPanicError(fmt.Sprint("inflight.add cannot add into a full inflights.")))
}
next := in.start + in.count
if next >= in.size {
next = next - in.size
}
in.buffer[next] = index
in.count = in.count + 1
}
func (in *inflight) freeTo(index uint64) {
if in.count == 0 || index < in.buffer[in.start] {
return
}
i, idx := 0, in.start
for ; i < in.count; i++ {
if index < in.buffer[idx] {
break
}
if idx = idx + 1; idx >= in.size {
idx = idx - in.size
}
}
in.count = in.count - i
in.start = idx
}
func (in *inflight) freeFirstOne() {
in.freeTo(in.buffer[in.start])
}
func (in *inflight) full() bool {
return in.count == in.size
}
func (in *inflight) reset() {
in.count = 0
in.start = 0
}
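// Illustrative sketch (not part of the package; exampleInflightWindow is a hypothetical
// helper): indexes are added as entries are sent and freed once the follower acknowledges
// them; the sender should pause while the window is full.
func exampleInflightWindow() {
	in := &inflight{size: 3, buffer: make([]uint64, 3)}
	in.add(10)
	in.add(11)
	in.add(12)
	_ = in.full() // true: three indexes are in flight
	in.freeTo(11) // follower acknowledged up to index 11
	_ = in.full() // false: only index 12 remains in flight
}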
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"encoding/binary"
"fmt"
"io"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type snapshotStatus struct {
respErr
stopCh chan struct{}
}
func newSnapshotStatus() *snapshotStatus {
f := &snapshotStatus{
stopCh: make(chan struct{}),
}
f.init()
return f
}
type snapshotRequest struct {
respErr
snapshotReader
header *proto.Message
}
func newSnapshotRequest(m *proto.Message, r *util.BufferReader) *snapshotRequest {
f := &snapshotRequest{
header: m,
snapshotReader: snapshotReader{reader: r},
}
f.init()
return f
}
func (r *snapshotRequest) response() error {
return <-r.error()
}
type snapshotReader struct {
reader *util.BufferReader
err error
}
func (r *snapshotReader) Next() ([]byte, error) {
if r.err != nil {
return nil, r.err
}
// read size header
// r.reader.Reset()
var buf []byte
if buf, r.err = r.reader.ReadFull(4); r.err != nil {
return nil, r.err
}
size := uint64(binary.BigEndian.Uint32(buf))
if size == 0 {
r.err = io.EOF
return nil, r.err
}
// read data
// r.reader.Reset()
if buf, r.err = r.reader.ReadFull(int(size)); r.err != nil {
return nil, r.err
}
return buf, nil
}
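// Illustrative sketch of the framing snapshotReader.Next expects (writeSnapshotChunk is a
// hypothetical helper, not part of the package): every chunk is a 4-byte big-endian length
// prefix followed by the payload, and a zero-length chunk terminates the stream.
func writeSnapshotChunk(w io.Writer, data []byte) error {
	var hdr [4]byte
	binary.BigEndian.PutUint32(hdr[:], uint32(len(data)))
	if _, err := w.Write(hdr[:]); err != nil {
		return err
	}
	if len(data) == 0 {
		return nil // a zero size marks the end of the snapshot stream
	}
	_, err := w.Write(data)
	return err
}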
func (s *raft) addSnapping(nodeID uint64, rs *snapshotStatus) {
s.mu.Lock()
defer s.mu.Unlock()
if snap, ok := s.snapping[nodeID]; ok {
close(snap.stopCh)
}
s.snapping[nodeID] = rs
}
func (s *raft) removeSnapping(nodeID uint64) {
s.mu.Lock()
defer s.mu.Unlock()
if snap, ok := s.snapping[nodeID]; ok {
close(snap.stopCh)
delete(s.snapping, nodeID)
}
}
func (s *raft) stopSnapping() {
s.mu.Lock()
defer s.mu.Unlock()
for id, snap := range s.snapping {
close(snap.stopCh)
delete(s.snapping, id)
}
}
func (s *raft) sendSnapshot(m *proto.Message) {
util.RunWorker(func() {
defer func() {
logger.Debug(" [raft] [%v term: %d] raftFm[%p] raftReplicas[%v] stop send snapshot "+
"without the replica from [%v]. to [%v]",
s.raftFsm.id, s.raftFsm.term, s.raftFsm, s.raftFsm.getReplicas(), m.Type, m.From, m.To)
s.removeSnapping(m.To)
m.Snapshot.Close()
proto.ReturnMessage(m)
}()
logger.Debug(" [raft] [%v term: %d] raftFm[%p] raftReplicas[%v] send snapshot "+
"without the replica from [%v ] to [%v].",
s.raftFsm.id, s.raftFsm.term, s.raftFsm, s.raftFsm.getReplicas(), m.Type, m.From, m.To)
// send snapshot
rs := newSnapshotStatus()
s.addSnapping(m.To, rs)
s.config.transport.SendSnapshot(m, rs)
select {
case <-s.stopc:
return
case <-rs.stopCh:
return
case err := <-rs.error():
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgSnapShot
nmsg.ID = m.ID
nmsg.From = m.To
nmsg.Reject = (err != nil)
s.recvc <- nmsg
}
}, func(err interface{}) {
s.doStop()
s.handlePanic(err)
})
}
func (s *raft) handleSnapshot(req *snapshotRequest) {
s.restoringSnapshot.Set(true)
var err error
defer func() {
req.respond(err)
s.resetTick()
s.restoringSnapshot.Set(false)
proto.ReturnMessage(req.header)
}()
// validate snapshot
if req.header.Term < s.raftFsm.term {
err = fmt.Errorf("raft %v [term: %d] ignored a snapshot message with lower term from %v [term: %d]", s.raftFsm.id, s.raftFsm.term, req.header.From, req.header.Term)
return
}
if req.header.Term > s.raftFsm.term || s.raftFsm.state != stateFollower {
s.raftFsm.becomeFollower(req.header.Term, req.header.From)
s.maybeChange(true)
}
if !s.raftFsm.checkSnapshot(req.header.SnapshotMeta) {
logger.Warn("raft %v [commit: %d] ignored snapshot [index: %d, term: %d].", s.raftFsm.id, s.raftFsm.raftLog.committed, req.header.SnapshotMeta.Index, req.header.SnapshotMeta.Term)
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = req.header.From
nmsg.Index = s.raftFsm.raftLog.committed
nmsg.Commit = s.raftFsm.raftLog.committed
s.raftFsm.send(nmsg)
return
}
// restore snapshot
s.raftConfig.Storage.ApplySnapshot(proto.SnapshotMeta{})
if err = s.raftConfig.StateMachine.ApplySnapshot(req.header.SnapshotMeta.Peers, req); err != nil {
return
}
if err = s.raftConfig.Storage.ApplySnapshot(req.header.SnapshotMeta); err != nil {
return
}
s.raftFsm.restore(req.header.SnapshotMeta)
s.peerState.replace(req.header.SnapshotMeta.Peers)
s.curApplied.Set(req.header.SnapshotMeta.Index)
// send snapshot response message
logger.Warn("raft %v [commit: %d] restored snapshot [index: %d, term: %d]",
s.raftFsm.id, s.raftFsm.raftLog.committed, req.header.SnapshotMeta.Index, req.header.SnapshotMeta.Term)
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = req.header.From
nmsg.Index = s.raftFsm.raftLog.lastIndex()
nmsg.Commit = s.raftFsm.raftLog.committed
s.raftFsm.send(nmsg)
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
)
// ReadOnlyOption read only option
type ReadOnlyOption int
const (
// ReadOnlySafe guarantees the linearizability of the read only request by
// communicating with the quorum. It is the default and suggested option.
ReadOnlySafe ReadOnlyOption = iota
// ReadOnlyLeaseBased ensures linearizability of the read only request by
// relying on the leader lease. It can be affected by clock drift.
// If the clock drift is unbounded, leader might keep the lease longer than it
// should (clock can move backward/pause without any bound). ReadIndex is not safe
// in that case.
ReadOnlyLeaseBased
)
type readIndexStatus struct {
index uint64
futures []*Future
acks map[uint64]struct{}
}
type readIndexReady struct {
index uint64
futures []*Future
}
type readOnly struct {
id uint64 // raft id
option ReadOnlyOption
// wait leader to commit an entry in current term
committed bool
// ReadIndex requests before leader commit entry in current term
scratch []*Future
// wait quorum ack
pendings map[uint64]*readIndexStatus
pendingQueue []uint64
// quorum acked, wait apply
readys map[uint64]*readIndexReady
readyQueue []uint64
}
func newReadOnly(id uint64, option ReadOnlyOption) *readOnly {
return &readOnly{
id: id,
option: option,
pendings: make(map[uint64]*readIndexStatus),
readys: make(map[uint64]*readIndexReady),
}
}
func (r *readOnly) addPending(index uint64, futures []*Future) {
if status, ok := r.pendings[index]; ok {
status.futures = append(status.futures, futures...)
return
}
// check index valid
if index <= r.lastPending() {
panic(AppPanicError(fmt.Sprintf("[raft->addReadOnly][%v] invalid index[%d]: less than last[%d]", r.id, index, r.lastPending())))
}
r.pendingQueue = append(r.pendingQueue, index)
r.pendings[index] = &readIndexStatus{
index: index,
futures: futures,
acks: make(map[uint64]struct{}),
}
}
func (r *readOnly) addReady(index uint64, futures []*Future) {
if status, ok := r.readys[index]; ok {
status.futures = append(status.futures, futures...)
return
}
r.readyQueue = append(r.readyQueue, index)
r.readys[index] = &readIndexReady{
index: index,
futures: futures,
}
}
func (r *readOnly) add(index uint64, futures []*Future) {
if !r.committed {
r.scratch = append(r.scratch, futures...)
return
}
if r.option == ReadOnlyLeaseBased {
r.addReady(index, futures)
} else {
r.addPending(index, futures)
}
}
func (r *readOnly) commit(index uint64) {
if !r.committed {
r.committed = true
if len(r.scratch) > 0 {
r.add(index, r.scratch)
r.scratch = nil
}
}
}
func (r *readOnly) lastPending() uint64 {
if len(r.pendingQueue) > 0 {
return r.pendingQueue[len(r.pendingQueue)-1]
}
return 0
}
func (r *readOnly) recvAck(index uint64, from uint64, quorum int) {
status, ok := r.pendings[index]
if !ok {
return
}
status.acks[from] = struct{}{}
// add one to include an ack from local node
if len(status.acks)+1 >= quorum {
r.advance(index)
}
}
func (r *readOnly) advance(index uint64) {
var i int
for _, idx := range r.pendingQueue {
if idx > index {
break
}
if rs, ok := r.pendings[idx]; ok {
r.addReady(idx, rs.futures)
delete(r.pendings, idx)
}
i++
}
r.pendingQueue = r.pendingQueue[i:]
}
func (r *readOnly) getReady(applied uint64) (futures []*Future) {
if len(r.readyQueue) == 0 {
return nil
}
var i int
for _, idx := range r.readyQueue {
if idx > applied {
break
}
if rs, ok := r.readys[idx]; ok {
futures = append(futures, rs.futures...)
delete(r.readys, idx)
}
i++
}
r.readyQueue = r.readyQueue[i:]
// TODO: remove this when stable
if logger.IsEnableDebug() {
logger.Debug("raft[%d] get ready index %d, futures len: %d", r.id, applied, len(futures))
}
return
}
func (r *readOnly) containsUpdate(applied uint64) bool {
return len(r.readyQueue) > 0 && applied >= r.readyQueue[0]
}
func (r *readOnly) reset(err error) {
respondReadIndex(r.scratch, err)
for _, status := range r.pendings {
respondReadIndex(status.futures, err)
}
for _, ready := range r.readys {
respondReadIndex(ready.futures, err)
}
r.committed = false
r.scratch = nil
r.pendings = make(map[uint64]*readIndexStatus)
r.pendingQueue = nil
r.readys = make(map[uint64]*readIndexReady)
}
func respondReadIndex(future []*Future, err error) {
for _, f := range future {
f.respond(nil, err)
}
}
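// Illustrative sketch of the leader-side ReadIndex flow (exampleReadIndexFlow is a
// hypothetical helper, not part of the package): a read is registered at the current
// commit index, quorum acks move it to the ready queue, and the futures are answered
// once the applied index catches up.
func exampleReadIndexFlow(f *Future) {
	ro := newReadOnly(1, ReadOnlySafe)
	ro.commit(8)            // the leader has committed an entry (index 8) in its own term
	ro.add(8, []*Future{f}) // register a pending read at the current commit index
	ro.recvAck(8, 2, 2)     // ack from node 2; with quorum 2 the read becomes ready
	respondReadIndex(ro.getReady(8), nil) // answer the futures once the applied index reaches 8
}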
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
"sync"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
var (
fatalStopc = make(chan uint64)
)
type RaftServer struct {
config *Config
ticker *time.Ticker
heartc chan *proto.Message
stopc chan struct{}
mu sync.RWMutex
rafts map[uint64]*raft
}
func (rs *RaftServer) RemoveRaftForce(raftId uint64, cc *proto.ConfChange) {
var s *raft
var ok bool
if s, ok = rs.rafts[raftId]; !ok {
return
}
// log the current replicas before applying the removal
peerChange := cc.Peer
for _, replica := range s.raftFsm.replicas {
logger.Info("raft[%v] replias [%v]", s.raftFsm.id, replica.peer.String())
}
s.raftFsm.removePeer(cc.Peer)
if _, ok := s.raftFsm.replicas[peerChange.PeerID]; !ok {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] applying configuration peer [%v] be removed and stop snapshot", s.raftFsm.id, peerChange)
}
s.removeSnapping(peerChange.PeerID)
s.peerState.change(cc)
if logger.IsEnableWarn() {
logger.Warn("raft[%v] applying configuration change %v.", s.raftFsm.id, cc)
}
}
}
func NewRaftServer(config *Config) (*RaftServer, error) {
if err := config.validate(); err != nil {
return nil, err
}
rs := &RaftServer{
config: config,
ticker: time.NewTicker(config.TickInterval),
rafts: make(map[uint64]*raft),
heartc: make(chan *proto.Message, 512),
stopc: make(chan struct{}),
}
if transport, err := NewMultiTransport(rs, &config.TransportConfig); err != nil {
return nil, err
} else {
rs.config.transport = transport
}
util.RunWorkerUtilStop(rs.run, rs.stopc)
return rs, nil
}
func (rs *RaftServer) run() {
ticks := 0
for {
select {
case <-rs.stopc:
return
case id := <-fatalStopc:
rs.mu.Lock()
delete(rs.rafts, id)
rs.mu.Unlock()
case m := <-rs.heartc:
switch m.Type {
case proto.ReqMsgHeartBeat:
rs.handleHeartbeat(m)
case proto.RespMsgHeartBeat:
rs.handleHeartbeatResp(m)
}
case <-rs.ticker.C:
ticks++
if ticks >= rs.config.HeartbeatTick {
ticks = 0
rs.sendHeartbeat()
}
rs.mu.RLock()
for _, raft := range rs.rafts {
raft.tick()
}
rs.mu.RUnlock()
}
}
}
func (rs *RaftServer) Stop() {
rs.mu.Lock()
defer rs.mu.Unlock()
select {
case <-rs.stopc:
return
default:
close(rs.stopc)
rs.ticker.Stop()
wg := new(sync.WaitGroup)
for id, s := range rs.rafts {
delete(rs.rafts, id)
wg.Add(1)
go func(r *raft) {
defer wg.Done()
r.stop()
}(s)
}
wg.Wait()
rs.config.transport.Stop()
}
}
func (rs *RaftServer) CreateRaft(raftConfig *RaftConfig) error {
var (
raft *raft
err error
)
defer func() {
if err != nil {
logger.Error("CreateRaft [%v] failed, error is:\r\n %s", raftConfig.ID, err.Error())
return
}
logger.Info("Create Raft success, id:%d", raftConfig.ID)
}()
if raft, err = newRaft(rs.config, raftConfig); err != nil {
return err
}
if raft == nil {
err = errors.New("CreateRaft return nil, maybe occur panic.")
return err
}
rs.mu.Lock()
defer rs.mu.Unlock()
if _, ok := rs.rafts[raftConfig.ID]; ok {
raft.stop()
err = ErrRaftExists
return err
}
rs.rafts[raftConfig.ID] = raft
return nil
}
func (rs *RaftServer) RemoveRaft(id uint64) error {
rs.mu.Lock()
raft, ok := rs.rafts[id]
delete(rs.rafts, id)
rs.mu.Unlock()
if ok {
raft.stop()
}
return nil
}
func (rs *RaftServer) Submit(id uint64, cmd []byte) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.propose(cmd, future)
return
}
func (rs *RaftServer) ChangeMember(id uint64, changeType proto.ConfChangeType, peer proto.Peer, context []byte) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.proposeMemberChange(&proto.ConfChange{Type: changeType, Peer: peer, Context: context}, future)
return
}
func (rs *RaftServer) IsRestoring(id uint64) bool {
rs.mu.RLock()
defer rs.mu.RUnlock()
if raft, ok := rs.rafts[id]; ok {
return raft.restoringSnapshot.Get() && raft.applied() == 0
}
return true
}
func (rs *RaftServer) Status(id uint64) (status *Status) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
status = raft.status()
}
if status == nil {
status = &Status{
ID: id,
NodeID: rs.config.NodeID,
Stopped: true,
}
}
return
}
func (rs *RaftServer) LeaderTerm(id uint64) (leader, term uint64) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.leaderTerm()
}
return NoLeader, 0
}
func (rs *RaftServer) IsLeader(id uint64) bool {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.isLeader()
}
return false
}
func (rs *RaftServer) AppliedIndex(id uint64) uint64 {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.applied()
}
return 0
}
func (rs *RaftServer) CommittedIndex(id uint64) uint64 {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.committed()
}
return 0
}
func (rs *RaftServer) FirstCommittedIndex(id uint64) uint64 {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.raftFsm.raftLog.firstIndex()
}
return 0
}
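// Illustrative sketch (not part of the package; exampleQueryRaftGroup is a hypothetical
// helper): the per-group accessors above return zero values when the raft group id is not
// registered on this node.
func exampleQueryRaftGroup(rs *RaftServer, id uint64) {
	if rs.IsLeader(id) {
		leader, term := rs.LeaderTerm(id)
		_, _ = leader, term
	}
	_ = rs.AppliedIndex(id)   // 0 if the group does not exist
	_ = rs.CommittedIndex(id) // 0 if the group does not exist
}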
func (rs *RaftServer) TryToLeader(id uint64) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.tryToLeader(future)
return
}
func (rs *RaftServer) Truncate(id uint64, index uint64) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if !ok {
return
}
raft.truncate(index)
}
func (rs *RaftServer) GetUnreachable(id uint64) (nodes []uint64) {
downReplicas := rs.GetDownReplicas(id)
for _, r := range downReplicas {
nodes = append(nodes, r.NodeID)
}
return
}
// GetDownReplicas returns the down replicas.
func (rs *RaftServer) GetDownReplicas(id uint64) (downReplicas []DownReplica) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if !ok {
return nil
}
status := raft.status()
if status != nil && len(status.Replicas) > 0 {
for n, r := range status.Replicas {
if n == rs.config.NodeID {
continue
}
since := time.Since(r.LastActive)
// a replica is considered down if it has not been active within two heartbeat intervals
downDuration := since - time.Duration(2*rs.config.HeartbeatTick)*rs.config.TickInterval
if downDuration > 0 {
downReplicas = append(downReplicas, DownReplica{
NodeID: n,
DownSeconds: int(downDuration / time.Second),
})
}
}
}
return
}
// GetPendingReplica get snapshot pending followers
func (rs *RaftServer) GetPendingReplica(id uint64) (peers []uint64) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if !ok {
return nil
}
status := raft.status()
if status != nil && len(status.Replicas) > 0 {
for n, r := range status.Replicas {
if n == rs.config.NodeID {
continue
}
if r.Snapshoting {
peers = append(peers, n)
}
}
}
return
}
// ReadIndex read index
func (rs *RaftServer) ReadIndex(id uint64) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.readIndex(future)
return
}
// GetEntries get raft log entries
func (rs *RaftServer) GetEntries(id uint64, startIndex uint64, maxSize uint64) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.getEntries(future, startIndex, maxSize)
return
}
func (rs *RaftServer) sendHeartbeat() {
// key: destination nodeID; value: ids of the raft groups led by this node that have a replica there
nodes := make(map[uint64]proto.HeartbeatContext)
rs.mu.RLock()
for id, raft := range rs.rafts {
if !raft.isLeader() {
continue
}
peers := raft.getPeers()
for _, p := range peers {
nodes[p] = append(nodes[p], id)
}
}
rs.mu.RUnlock()
for to, ctx := range nodes {
if to == rs.config.NodeID {
continue
}
msg := proto.GetMessage()
msg.Type = proto.ReqMsgHeartBeat
msg.From = rs.config.NodeID
msg.To = to
msg.Context = proto.EncodeHBConext(ctx)
rs.config.transport.Send(msg)
}
}
func (rs *RaftServer) handleHeartbeat(m *proto.Message) {
ctx := proto.DecodeHBContext(m.Context)
var respCtx proto.HeartbeatContext
rs.mu.RLock()
for _, id := range ctx {
if raft, ok := rs.rafts[id]; ok {
raft.reciveMessage(m)
respCtx = append(respCtx, id)
}
}
rs.mu.RUnlock()
msg := proto.GetMessage()
msg.Type = proto.RespMsgHeartBeat
msg.From = rs.config.NodeID
msg.To = m.From
msg.Context = proto.EncodeHBConext(respCtx)
rs.config.transport.Send(msg)
}
func (rs *RaftServer) handleHeartbeatResp(m *proto.Message) {
ctx := proto.DecodeHBContext(m.Context)
rs.mu.RLock()
defer rs.mu.RUnlock()
for _, id := range ctx {
if raft, ok := rs.rafts[id]; ok {
raft.reciveMessage(m)
}
}
}
func (rs *RaftServer) reciveMessage(m *proto.Message) {
if m.Type == proto.ReqMsgHeartBeat || m.Type == proto.RespMsgHeartBeat {
rs.heartc <- m
return
}
rs.mu.RLock()
raft, ok := rs.rafts[m.ID]
rs.mu.RUnlock()
if ok {
raft.reciveMessage(m)
}
}
func (rs *RaftServer) reciveSnapshot(req *snapshotRequest) {
rs.mu.RLock()
raft, ok := rs.rafts[req.header.ID]
rs.mu.RUnlock()
if !ok {
req.respond(ErrRaftNotExists)
return
}
raft.reciveSnapshot(req)
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// The StateMachine interface is supplied by the application to apply commands and to persist/snapshot application data.
type StateMachine interface {
Apply(command []byte, index uint64) (interface{}, error)
ApplyMemberChange(confChange *proto.ConfChange, index uint64) (interface{}, error)
Snapshot() (proto.Snapshot, error)
ApplySnapshot(peers []proto.Peer, iter proto.SnapIterator) error
HandleFatalEvent(err *FatalError)
HandleLeaderChange(leader uint64)
}
type SocketType byte
const (
HeartBeat SocketType = 0
Replicate SocketType = 1
)
func (t SocketType) String() string {
switch t {
case 0:
return "HeartBeat"
case 1:
return "Replicate"
}
return "unkown"
}
// The SocketResolver interface is supplied by the application to resolve a NodeID to a network address.
type SocketResolver interface {
NodeAddress(nodeID uint64, stype SocketType) (addr string, err error)
}
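// Illustrative sketch (not part of the package): a minimal map-backed SocketResolver,
// assuming one address per node for both heartbeat and replicate traffic. mapResolver is
// a hypothetical type, and the error path assumes "fmt" is imported.
type mapResolver struct {
	addrs map[uint64]string // nodeID -> "host:port"
}

func (r *mapResolver) NodeAddress(nodeID uint64, stype SocketType) (string, error) {
	if addr, ok := r.addrs[nodeID]; ok {
		return addr, nil
	}
	return "", fmt.Errorf("no address for node %d (%s socket)", nodeID, stype)
}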
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"time"
)
// DownReplica down replica
type DownReplica struct {
NodeID uint64
DownSeconds int
}
// ReplicaStatus replica status
type ReplicaStatus struct {
Match uint64 // replication progress (highest matched index)
Commit uint64 // commit position
Next uint64
State string
Snapshoting bool
Paused bool
Active bool
LastActive time.Time
Inflight int
}
// Status raft status
type Status struct {
ID uint64
NodeID uint64
Leader uint64
Term uint64
Index uint64
Commit uint64
Applied uint64
Vote uint64
PendQueue int
RecvQueue int
AppQueue int
Stopped bool
RestoringSnapshot bool
State string // leader, follower, candidate
Replicas map[uint64]*ReplicaStatus
}
func (s *Status) String() string {
st := "running"
if s.Stopped {
st = "stopped"
} else if s.RestoringSnapshot {
st = "snapshot"
}
j := fmt.Sprintf(`{"id":"%v","nodeID":"%v","state":"%v","leader":"%v","term":"%v","index":"%v","commit":"%v","applied":"%v","vote":"%v","pendingQueue":"%v",
"recvQueue":"%v","applyQueue":"%v","status":"%v","replication":{`, s.ID, s.NodeID, s.State, s.Leader, s.Term, s.Index, s.Commit, s.Applied, s.Vote, s.PendQueue, s.RecvQueue, s.AppQueue, st)
if len(s.Replicas) == 0 {
j += "}}"
} else {
for k, v := range s.Replicas {
p := "false"
if v.Paused {
p = "true"
}
subj := fmt.Sprintf(`"%v":{"match":"%v","commit":"%v","next":"%v","state":"%v","paused":"%v","inflight":"%v","active":"%v"},`, k, v.Match, v.Commit, v.Next, v.State, p, v.Inflight, v.Active)
j += subj
}
j = j[:len(j)-1] + "}}"
}
return j
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"errors"
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type fsm interface {
AppliedIndex(id uint64) uint64
}
// MemoryStorage is a circular in-memory storage that truncates entries once it grows over capacity,
// while trying to keep the capacity high.
type MemoryStorage struct {
fsm fsm
id uint64
// the truncation threshold
capacity uint64
// the index of the last truncation
truncIndex uint64
truncTerm uint64
// the starting offset within ents
start uint64
// the number of log entries currently held in ents
count uint64
// the allocated length of ents
size uint64
// the log entries, stored as a circular buffer
ents []*proto.Entry
hardState proto.HardState
}
func NewMemoryStorage(fsm fsm, id, capacity uint64) *MemoryStorage {
if logger.IsEnableWarn() {
logger.Warn("Memory Storage capacity is: %v.", capacity)
}
return &MemoryStorage{
fsm: fsm,
id: id,
capacity: capacity,
size: capacity,
ents: make([]*proto.Entry, capacity),
}
}
func DefaultMemoryStorage() *MemoryStorage {
return NewMemoryStorage(nil, 0, 4096)
}
func (ms *MemoryStorage) InitialState() (proto.HardState, error) {
return ms.hardState, nil
}
func (ms *MemoryStorage) FirstIndex() (uint64, error) {
return ms.truncIndex + 1, nil
}
func (ms *MemoryStorage) LastIndex() (uint64, error) {
return ms.lastIndex(), nil
}
func (ms *MemoryStorage) lastIndex() uint64 {
return ms.truncIndex + ms.count
}
func (ms *MemoryStorage) Term(index uint64) (term uint64, isCompact bool, err error) {
switch {
case index < ms.truncIndex:
return 0, true, nil
case index == ms.truncIndex:
return ms.truncTerm, false, nil
default:
return ms.ents[ms.locatePosition(index)].Term, false, nil
}
}
func (ms *MemoryStorage) Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error) {
if lo <= ms.truncIndex {
return nil, true, nil
}
if hi > ms.lastIndex()+1 {
return nil, false, fmt.Errorf("[MemoryStorage->Entries]entries' hi(%d) is out of bound lastindex(%d)", hi, ms.lastIndex())
}
// the log holds no entries
if ms.count == 0 {
return nil, false, errors.New("requested entry at index is unavailable")
}
count := hi - lo
if count <= 0 {
return []*proto.Entry{}, false, nil
}
retEnts := make([]*proto.Entry, count)
pos := ms.locatePosition(lo)
retEnts[0] = ms.ents[pos]
size := ms.ents[pos].Size()
limit := uint64(1)
for ; limit < count; limit++ {
pos = pos + 1
if pos >= ms.size {
pos = pos - ms.size
}
size = size + ms.ents[pos].Size()
if uint64(size) > maxSize {
break
}
retEnts[limit] = ms.ents[pos]
}
return retEnts[:limit], false, nil
}
// StoreEntries is the equivalent of append in etcd raft
func (ms *MemoryStorage) StoreEntries(entries []*proto.Entry) error {
if len(entries) == 0 {
return nil
}
appIndex := uint64(0)
if ms.fsm != nil {
appIndex = ms.fsm.AppliedIndex(ms.id)
}
first := appIndex + 1
last := entries[0].Index + uint64(len(entries)) - 1
if last < first {
// shortcut if there is no new entry.
return nil
}
if first > entries[0].Index {
// truncate compacted entries
entries = entries[first-entries[0].Index:]
}
offset := entries[0].Index - ms.truncIndex - 1
if ms.count < offset {
logger.Error("missing log entry [last: %d, append at: %d]", ms.lastIndex(), entries[0].Index)
return nil
}
// resize and truncate compacted ents
entriesSize := uint64(len(entries))
maxSize := offset + entriesSize
minSize := maxSize - (appIndex - ms.truncIndex)
switch {
case minSize > ms.capacity:
// truncate compacted ents
if ms.truncIndex < appIndex {
ms.truncateTo(appIndex)
}
// grow ents
if minSize > ms.size {
ms.resize(ms.capacity+minSize, minSize)
}
default:
// truncate compacted ents
if maxSize > ms.capacity {
cmpIdx := util.Min(appIndex, maxSize-ms.capacity+ms.truncIndex)
if ms.truncIndex < cmpIdx {
ms.truncateTo(cmpIdx)
}
}
// shrink ents back to capacity
if ms.size > ms.capacity {
ms.resize(ms.capacity, maxSize)
}
}
// append new entries
start := ms.locatePosition(entries[0].Index)
next := start + entriesSize
if next <= ms.size {
copy(ms.ents[start:], entries)
if ms.start <= start {
ms.count = next - ms.start
} else {
ms.count = (ms.size - ms.start) + (next - 0)
}
} else {
count := ms.size - start
copy(ms.ents[start:], entries[0:count])
copy(ms.ents[0:], entries[count:])
ms.count = (ms.size - ms.start) + (entriesSize - count)
}
return nil
}
func (ms *MemoryStorage) StoreHardState(st proto.HardState) error {
ms.hardState = st
return nil
}
func (ms *MemoryStorage) ApplySnapshot(meta proto.SnapshotMeta) error {
ms.truncIndex = meta.Index
ms.truncTerm = meta.Term
ms.start = 0
ms.count = 0
ms.size = ms.capacity
ms.ents = make([]*proto.Entry, ms.capacity)
return nil
}
func (ms *MemoryStorage) Truncate(index uint64) error {
if index == 0 || index <= ms.truncIndex {
return errors.New("requested index is unavailable due to compaction")
}
if index > ms.lastIndex() {
return fmt.Errorf("compact %d is out of bound lastindex(%d)", index, ms.lastIndex())
}
ms.truncateTo(index)
return nil
}
func (ms *MemoryStorage) Close() {
}
func (ms *MemoryStorage) truncateTo(index uint64) {
ms.truncTerm = ms.ents[ms.locatePosition(index)].Term
ms.start = ms.locatePosition(index + 1)
ms.count = ms.count - (index - ms.truncIndex)
ms.truncIndex = index
}
func (ms *MemoryStorage) resize(capacity, needSize uint64) {
ents := make([]*proto.Entry, capacity)
count := util.Min(util.Min(capacity, ms.count), needSize)
next := ms.start + count
if next <= ms.size {
copy(ents, ms.ents[ms.start:next])
} else {
next = next - ms.size
copy(ents, ms.ents[ms.start:])
copy(ents[ms.size-ms.start:], ms.ents[0:next])
}
ms.start = 0
ms.count = count
ms.size = capacity
ms.ents = ents
}
func (ms *MemoryStorage) locatePosition(index uint64) uint64 {
position := ms.start + (index - ms.truncIndex - 1)
if position >= ms.size {
position = position - ms.size
}
return position
}
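// Illustrative sketch (not part of the package; exampleMemoryStorage is a hypothetical
// helper): entries are appended with StoreEntries, served with Entries, and dropped from
// the front with Truncate once they are no longer needed.
func exampleMemoryStorage() error {
	ms := DefaultMemoryStorage()
	if err := ms.StoreEntries([]*proto.Entry{{Index: 1, Term: 1}, {Index: 2, Term: 1}}); err != nil {
		return err
	}
	ents, isCompact, err := ms.Entries(1, 3, 1<<20) // read [1, 3) with a 1MB size budget
	if err != nil {
		return err
	}
	_, _ = ents, isCompact // two entries, isCompact is false here
	return ms.Truncate(1)  // drop index 1; FirstIndex() becomes 2
}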
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import "github.com/cubefs/cubefs/depends/tiglabs/raft/util"
const (
DefaultFileCacheCapacity = 2
DefaultFileSize = 32 * util.MB
MinFileSize = 1 * util.MB
MaxRotateInterval = 86400
DefaultSync = false
)
// Config wal config
type Config struct {
// FileCacheCapacity is the number of open log files to cache (including their indexes, etc.)
FileCacheCapacity int
// FileSize is the size of each log file
FileSize int
Sync bool
// TruncateFirstDummy appends one dummy log entry at initialization and then truncates it
TruncateFirstDummy bool
}
func (c *Config) GetFileCacheCapacity() int {
if c == nil || c.FileCacheCapacity <= 0 {
return DefaultFileCacheCapacity
}
return c.FileCacheCapacity
}
func (c *Config) GetFileSize() int {
if c == nil || c.FileSize <= 0 {
return DefaultFileSize
}
return c.FileSize
}
func (c *Config) GetSync() bool {
if c == nil {
return DefaultSync
}
return c.Sync
}
func (c *Config) GetTruncateFirstDummy() bool {
if c == nil {
return false
}
return c.TruncateFirstDummy
}
func (c *Config) dup() *Config {
if c != nil {
dc := *c
return &dc
} else {
return nil
}
}
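// Illustrative sketch (not part of the package; exampleConfigDefaults is a hypothetical
// helper): a nil *Config is valid, and the getters above fall back to the package defaults.
func exampleConfigDefaults() (int, int, bool) {
	var c *Config
	return c.GetFileCacheCapacity(), c.GetFileSize(), c.GetSync()
	// => DefaultFileCacheCapacity, DefaultFileSize, DefaultSync
}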
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/google/btree"
)
type cacheItem proto.Entry
func (c *cacheItem) Less(than btree.Item) bool {
return c.Index < than.(*cacheItem).Index
}
// the cache keeps only the newest (largest-index) entries
type entryCache struct {
capacity int
ents *btree.BTree
key *cacheItem
}
func newEntryCache(capacity int) *entryCache {
return &entryCache{
capacity: capacity,
ents: btree.New(4),
key: new(cacheItem),
}
}
func (c *entryCache) Get(index uint64) *proto.Entry {
c.key.Index = index
ent := c.ents.Get(c.key)
if ent != nil {
return (*proto.Entry)(ent.(*cacheItem))
} else {
return nil
}
}
func (c *entryCache) Append(ent *proto.Entry) {
// truncate conflicting entries
for c.ents.Len() > 0 && c.ents.Max().(*cacheItem).Index >= ent.Index {
c.ents.DeleteMax()
}
c.ents.ReplaceOrInsert((*cacheItem)(ent))
// keep capacity
for c.ents.Len() > c.capacity {
c.ents.DeleteMin()
}
}
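// Illustrative sketch (not part of the package; exampleEntryCache is a hypothetical
// helper): the cache keeps only the newest `capacity` entries, and appending an entry
// whose index conflicts with cached ones truncates the conflicting suffix first.
func exampleEntryCache() *proto.Entry {
	c := newEntryCache(2)
	c.Append(&proto.Entry{Index: 10, Term: 1})
	c.Append(&proto.Entry{Index: 11, Term: 1})
	c.Append(&proto.Entry{Index: 12, Term: 1}) // capacity 2: index 10 is evicted
	return c.Get(10)                           // nil, the entry was evicted
}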
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import "container/list"
type openFunc func(logFileName) (*logEntryFile, error)
type logFileCache struct {
capacity int
l *list.List
m map[logFileName]*list.Element // keyed by log file name
f openFunc
}
func newLogFileCache(capacity int, f openFunc) *logFileCache {
return &logFileCache{
capacity: capacity,
l: list.New(),
m: make(map[logFileName]*list.Element, capacity),
f: f,
}
}
func (lc *logFileCache) Get(name logFileName) (lf *logEntryFile, err error) {
e, ok := lc.m[name]
if ok {
lf = (e.Value).(*logEntryFile)
lc.l.MoveToFront(e)
return
}
// not cached, open a new one
lf, err = lc.f(name)
if err != nil {
return
}
// cache it
e = lc.l.PushFront(lf)
lc.m[name] = e
// keep capacity
for lc.l.Len() > lc.capacity {
e = lc.l.Back()
df := (e.Value).(*logEntryFile)
if err = lc.Delete(df.Name(), true); err != nil {
return nil, err
}
}
return
}
func (lc *logFileCache) Delete(name logFileName, close bool) error {
e, ok := lc.m[name]
if !ok {
return nil
}
lf := e.Value.(*logEntryFile)
if close {
if err := lf.Close(); err != nil {
return err
}
}
delete(lc.m, lf.Name())
lc.l.Remove(e)
return nil
}
func (lc *logFileCache) Close() (err error) {
for _, e := range lc.m {
f := (e.Value).(*logEntryFile)
err = f.Close()
}
return
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"errors"
"fmt"
"io"
"os"
"sort"
)
// initDir creates the directory if it does not exist; if it exists, it checks that the path is a directory.
func initDir(dir string) error {
info, err := os.Stat(dir)
if err != nil {
if pathErr, ok := err.(*os.PathError); ok {
if os.IsNotExist(pathErr) {
return os.MkdirAll(dir, 0755)
}
}
return err
}
if !info.IsDir() {
return errors.New("fbase/raftstore: path is not directory")
}
return nil
}
// log file names have the form seq-index.log
type logFileName struct {
seq uint64 // file sequence number
index uint64 // index of the first log entry in this file
}
func (l *logFileName) String() string {
return fmt.Sprintf("%016x-%016x.log", l.seq, l.index)
}
func (l *logFileName) ParseFrom(s string) bool {
_, err := fmt.Sscanf(s, "%016x-%016x.log", &l.seq, &l.index)
return err == nil
}
type nameSlice []logFileName
func (s nameSlice) Len() int { return len(s) }
func (s nameSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s nameSlice) Less(i, j int) bool { return s[i].seq < s[j].seq }
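// Illustrative sketch (not part of the package; exampleLogFileName is a hypothetical
// helper): file names round-trip through String and ParseFrom, e.g. seq 3 with start
// index 128 maps to "0000000000000003-0000000000000080.log".
func exampleLogFileName() bool {
	n := logFileName{seq: 3, index: 128}
	var parsed logFileName
	return parsed.ParseFrom(n.String()) && parsed == n
}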
// listLogEntryFiles enumerates all log files in the directory and sorts them by sequence number
func listLogEntryFiles(path string) (fnames []logFileName, err error) {
dir, err := os.Open(path)
if err != nil {
return nil, err
}
defer dir.Close()
names, err := dir.Readdirnames(0)
if err != nil {
return nil, err
}
for _, name := range names {
var n logFileName
if n.ParseFrom(name) {
fnames = append(fnames, n)
}
}
sort.Sort(nameSlice(fnames))
return
}
// fallocDegraded is a degraded fallback for preallocating file space
func fallocDegraded(f *os.File, sizeInBytes int64) error {
curOff, err := f.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
size, err := f.Seek(sizeInBytes, io.SeekEnd)
if err != nil {
return err
}
if _, err = f.Seek(curOff, io.SeekStart); err != nil {
return err
}
if sizeInBytes > size {
return nil
}
return f.Truncate(sizeInBytes)
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build linux
// +build linux
package wal
import (
"os"
"syscall"
)
const (
fallocateModeDefault uint32 = 0 // in default mode the preallocated space is zero-filled
fallocateModeKeepSize uint32 = 1 // keep the original file size after preallocation, do not zero-fill
)
func fdatasync(f *os.File) error {
return syscall.Fdatasync(int(f.Fd()))
}
// preallocate space and zero-fill it
func fallocate(f *os.File, sizeInBytes int64) error {
err := syscall.Fallocate(int(f.Fd()), fallocateModeDefault, 0, sizeInBytes)
if err != nil {
errno, ok := err.(syscall.Errno)
if ok && (errno == syscall.ENOTSUP || errno == syscall.EINTR) {
return fallocDegraded(f, sizeInBytes)
}
}
return err
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bytes"
"io"
"os"
"path"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
)
type logEntryFile struct {
dir string
name logFileName
f *os.File
r recordReadAt
w *recordWriter
index logEntryIndex
}
func openLogEntryFile(dir string, name logFileName, isLastOne bool) (*logEntryFile, error) {
p := path.Join(dir, name.String())
f, err := os.OpenFile(p, os.O_RDWR|os.O_APPEND, 0600)
if err != nil {
return nil, err
}
lf := &logEntryFile{
dir: dir,
name: name,
f: f,
r: newRecordReader(f),
}
if !isLastOne {
// read the index data
if err = lf.ReadIndex(); err != nil {
return nil, err
}
} else {
// rebuild the index
toffset, err := lf.ReBuildIndex()
if err != nil && err != io.ErrUnexpectedEOF && !IsErrCorrupt(err) {
return nil, err
}
// open for writing
if err = lf.OpenWrite(); err != nil {
return nil, err
}
// truncate the index record and everything after it
if toffset >= 0 {
log.Warn("truncate last logfile's N@%d index at: %d", lf.name.seq, toffset)
if err := lf.w.Truncate(toffset); err != nil {
return nil, err
}
}
}
return lf, nil
}
func createLogEntryFile(dir string, name logFileName) (*logEntryFile, error) {
p := path.Join(dir, name.String())
f, err := os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC|os.O_APPEND, 0600)
if err != nil {
return nil, err
}
lf := &logEntryFile{
dir: dir,
name: name,
f: f,
r: newRecordReader(f),
}
if err := lf.OpenWrite(); err != nil {
return nil, err
}
return lf, nil
}
func (lf *logEntryFile) ReadIndex() error {
info, err := lf.f.Stat()
if err != nil {
return err
}
// read footer
var footer footerRecord
if info.Size() < int64(footer.Size()) {
return NewCorruptError(lf.f.Name(), 0, "too small footer")
}
offset := info.Size() - int64(recordSize(footer))
rec, err := lf.r.ReadAt(offset)
if err != nil {
return err
}
if rec.recType != recTypeFooter {
return NewCorruptError(lf.f.Name(), offset, "wrong footer record type")
}
if rec.dataLen != footer.Size() {
return NewCorruptError(lf.f.Name(), offset, "wrong footer size")
}
footer.Decode(rec.data)
if !bytes.Equal(footer.magic, footerMagic) {
return NewCorruptError(lf.f.Name(), offset, "wrong footer magic")
}
// read index data
offset = int64(footer.indexOffset)
rec, err = lf.r.ReadAt(offset)
if err != nil {
return err
}
if rec.recType != recTypeIndex {
return NewCorruptError(lf.f.Name(), offset, "wrong index record type")
}
lf.index = decodeLogIndex(rec.data)
return nil
}
func (lf *logEntryFile) ReBuildIndex() (truncateOffset int64, err error) {
lf.index = nil
// get the file size
info, err := lf.f.Stat()
if err != nil {
return 0, err
}
filesize := info.Size()
var (
rec record
offset int64
nextRecordOffset int64
)
r := newRecordReader(lf.f)
for {
offset, rec, err = r.Read()
if err != nil {
break
}
nextRecordOffset = r.offset
// for log entry records, update the index
if rec.recType == recTypeLogEntry {
ent := &proto.Entry{}
ent.Decode(rec.data)
lf.index = lf.index.Append(uint32(offset), ent)
} else {
// All valid log entries have been loaded
return offset, nil
}
}
if err == io.EOF {
err = nil
}
if filesize != nextRecordOffset {
log.Warn("logName[%v],fileSize[%v],corrupt data after offset[%v]", lf.name, filesize, nextRecordOffset)
}
return offset, err
}
func (lf *logEntryFile) Name() logFileName {
return lf.name
}
func (lf *logEntryFile) Seq() uint64 {
return lf.name.seq
}
func (lf *logEntryFile) Len() int {
return lf.index.Len()
}
func (lf *logEntryFile) FirstIndex() uint64 {
return lf.index.First()
}
func (lf *logEntryFile) LastIndex() uint64 {
return lf.index.Last()
}
// Get get log entry
func (lf *logEntryFile) Get(i uint64) (*proto.Entry, error) {
item, err := lf.index.Get(i)
if err != nil {
return nil, err
}
rec, err := lf.r.ReadAt(int64(item.offset))
if err != nil {
return nil, err
}
ent := &proto.Entry{}
ent.Decode(rec.data)
return ent, nil
}
// Term get log's term
func (lf *logEntryFile) Term(i uint64) (uint64, error) {
item, err := lf.index.Get(i)
if err != nil {
return 0, err
}
return item.logterm, nil
}
// Truncate truncates the log from the given index onward
func (lf *logEntryFile) Truncate(index uint64) error {
if lf.Len() == 0 {
return nil
}
item, err := lf.index.Get(index)
if err != nil {
return err
}
// truncate the file
offset := int64(item.offset)
if err = lf.w.Truncate(offset); err != nil {
return err
}
// truncate the index
lf.index, err = lf.index.Truncate(index)
return err
}
func (lf *logEntryFile) Save(ent *proto.Entry) error {
// write to the file
offset := lf.w.Offset()
if err := lf.w.Write(recTypeLogEntry, ent); err != nil {
return err
}
// update the index
lf.index = lf.index.Append(uint32(offset), ent)
return nil
}
func (lf *logEntryFile) OpenWrite() error {
if lf.w != nil {
return nil
}
lf.w = newRecordWriter(lf.f)
return nil
}
func (lf *logEntryFile) WriteOffset() int64 {
return lf.w.Offset()
}
func (lf *logEntryFile) Flush() error {
return lf.w.Flush()
}
// Sync flush write buffer and sync to disk
func (lf *logEntryFile) Sync() error {
return lf.w.Sync()
}
func (lf *logEntryFile) FinishWrite() error {
var err error
// write log index data
recOffset := lf.w.Offset()
if err = lf.w.Write(recTypeIndex, lf.index); err != nil {
return err
}
// write log file footer
footer := &footerRecord{
indexOffset: uint64(recOffset),
}
if err = lf.w.Write(recTypeFooter, footer); err != nil {
return err
}
if err := lf.w.Close(); err != nil {
return err
}
lf.w = nil
return nil
}
// Close closes the writer (if open) and the underlying file
func (lf *logEntryFile) Close() error {
if lf.w != nil {
if err := lf.w.Close(); err != nil {
return err
}
lf.w = nil
}
return lf.f.Close()
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"encoding/binary"
"fmt"
"io"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
const indexItemSize = 8 + 8 + 4
type indexItem struct {
logindex uint64 // index of the log entry
logterm uint64 // term of the log entry
offset uint32 // offset of the entry within the file
}
type logEntryIndex []indexItem
func (li logEntryIndex) First() uint64 {
if len(li) == 0 {
return 0
}
return li[0].logindex
}
func (li logEntryIndex) Last() uint64 {
size := len(li)
if size == 0 {
return 0
}
return li[size-1].logindex
}
func (li logEntryIndex) Get(i uint64) (item indexItem, err error) {
size := len(li)
if size == 0 {
err = fmt.Errorf("maybe index(%d) is out of bound lastindex(%d)", i, li.Last())
return
}
ibegin := li[0].logindex
iend := li[size-1].logindex
if i < ibegin || i > iend {
err = fmt.Errorf("maybe index(%d) is out of bound lastindex(%d)", i, li.Last())
return
}
return li[i-ibegin], nil
}
func (li logEntryIndex) Append(offset uint32, entry *proto.Entry) logEntryIndex {
return append(li, indexItem{
logindex: entry.Index,
logterm: entry.Term,
offset: offset,
})
}
func (li logEntryIndex) Truncate(i uint64) (logEntryIndex, error) {
if _, err := li.Get(i); err != nil {
return nil, err
}
return li[:i-li[0].logindex], nil
}
func (li logEntryIndex) Len() int {
return len(li)
}
// Encode implements the recordData interface's Encode method
func (li logEntryIndex) Encode(w io.Writer) (err error) {
u32Buf := make([]byte, 4)
u64Buf := make([]byte, 8)
// write index items count
binary.BigEndian.PutUint32(u32Buf, uint32(li.Len()))
if _, err = w.Write(u32Buf); err != nil {
return
}
// write index items
for _, item := range li {
// logindex
binary.BigEndian.PutUint64(u64Buf, item.logindex)
if _, err = w.Write(u64Buf); err != nil {
return
}
// logterm
binary.BigEndian.PutUint64(u64Buf, item.logterm)
if _, err = w.Write(u64Buf); err != nil {
return
}
// logoffset
binary.BigEndian.PutUint32(u32Buf, item.offset)
if _, err = w.Write(u32Buf); err != nil {
return
}
}
return
}
// Size implements the recordData interface's Size method
func (li logEntryIndex) Size() uint64 {
return uint64(4 + li.Len()*indexItemSize)
}
func decodeLogIndex(data []byte) logEntryIndex {
offset := 0
nItems := binary.BigEndian.Uint32(data[offset:])
offset += 4
li := make([]indexItem, nItems)
for i := 0; i < int(nItems); i++ {
li[i].logindex = binary.BigEndian.Uint64(data[offset:])
offset += 8
li[i].logterm = binary.BigEndian.Uint64(data[offset:])
offset += 8
li[i].offset = binary.BigEndian.Uint32(data[offset:])
offset += 4
}
return li
}
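// Illustrative sketch (not part of the package; exampleIndexRoundTrip is a hypothetical
// helper and assumes "bytes" is imported): Encode and decodeLogIndex are inverses, each
// item being 8 bytes of log index, 8 bytes of term and 4 bytes of file offset, preceded
// by a 4-byte item count.
func exampleIndexRoundTrip() (logEntryIndex, error) {
	var li logEntryIndex
	li = li.Append(0, &proto.Entry{Index: 100, Term: 7})
	li = li.Append(53, &proto.Entry{Index: 101, Term: 7})
	var buf bytes.Buffer
	if err := li.Encode(&buf); err != nil {
		return nil, err
	}
	return decodeLogIndex(buf.Bytes()), nil // equal to li
}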
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"fmt"
"os"
"path"
"sort"
"math"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
type logEntryStorage struct {
s *Storage
dir string
filesize int
rotateTime int64
logfiles []logFileName // names of all log files
last *logEntryFile
nextFileSeq uint64
cache *logFileCache
}
func openLogStorage(dir string, s *Storage) (*logEntryStorage, error) {
ls := &logEntryStorage{
s: s,
dir: dir,
filesize: s.c.GetFileSize(),
rotateTime: timeutil.GetCurrentTimeUnix(),
nextFileSeq: 1,
}
// cache
ls.cache = newLogFileCache(s.c.GetFileCacheCapacity(),
func(name logFileName) (*logEntryFile, error) {
return openLogEntryFile(ls.dir, name, false)
})
// open
if err := ls.open(); err != nil {
return nil, err
}
return ls, nil
}
func (ls *logEntryStorage) open() error {
names, err := listLogEntryFiles(ls.dir)
if err != nil {
return err
}
// no existing log files, create the first one
if len(names) == 0 {
f, err := ls.createNew(1)
if err != nil {
return err
}
ls.logfiles = append(ls.logfiles, f.Name())
ls.last = f
return nil
}
nlen := len(names)
ls.nextFileSeq = names[nlen-1].seq + 1 // set nextFileSeq to the largest existing seq plus one
ls.logfiles = append(ls.logfiles, names...)
f, err := openLogEntryFile(ls.dir, ls.logfiles[nlen-1], true) // open the last file
if err != nil {
return err
}
ls.last = f
return nil
}
func (ls *logEntryStorage) Term(i uint64) (term uint64, isCompact bool, err error) {
lf, err := ls.locateFile(i)
if err != nil {
return
}
term, err = lf.Term(i)
return
}
func (ls *logEntryStorage) LastIndex() uint64 {
// the last log file is empty
if ls.last.Len() == 0 {
if len(ls.logfiles) > 1 { // the previous file's last index is this file's start index minus one
return ls.last.name.index - 1
}
return 0
}
return ls.last.LastIndex()
}
func (ls *logEntryStorage) Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error) {
if lo > ls.LastIndex() {
err = fmt.Errorf("entries's hi(%d) is out of bound lastindex(%d)", hi, ls.LastIndex())
return
}
si := ls.locate(lo)
lfs := ls.logfiles[si:]
var ent *proto.Entry
var lf *logEntryFile
i := lo
var size uint64
// read entries from the log files
for _, fn := range lfs {
if fn.index >= hi {
return
}
lf, err = ls.get(fn)
if err != nil {
return
}
for i <= lf.LastIndex() {
ent, err = lf.Get(i)
if err != nil {
return
}
if i >= hi {
return
}
size += ent.Size()
entries = append(entries, ent)
i++
if size > maxSize {
return
}
}
}
return
}
func (ls *logEntryStorage) SaveEntries(ents []*proto.Entry) error {
if len(ents) == 0 {
return nil
}
if err := ls.truncateBack(ents[0].Index); err != nil {
return err
}
for _, ent := range ents {
if err := ls.saveEntry(ent); err != nil {
return err
}
}
// flush the application-level buffer and write it to the file
if err := ls.last.Flush(); err != nil {
return err
}
return nil
}
func (ls *logEntryStorage) Sync() error {
return ls.last.Sync()
}
// TruncateFront truncates from the front and is used to delete old data; a file is removed only when all of its entries are old.
func (ls *logEntryStorage) TruncateFront(index uint64) error {
truncFIndex := -1
for i := 0; i < len(ls.logfiles)-1; i++ {
if ls.logfiles[i+1].index-1 <= index {
truncFIndex = i
} else {
break
}
}
for i := 0; i <= truncFIndex; i++ {
if err := ls.remove(ls.logfiles[i]); err != nil {
return err
}
}
if truncFIndex >= 0 {
ls.logfiles = ls.logfiles[truncFIndex+1:]
}
return nil
}
// TruncateAll removes all log files.
func (ls *logEntryStorage) TruncateAll() error {
for _, f := range ls.logfiles {
if err := ls.remove(f); err != nil {
return err
}
}
ls.nextFileSeq = 1
ls.logfiles = nil
lf, err := ls.createNew(1)
if err != nil {
return err
}
ls.last = lf
ls.logfiles = append(ls.logfiles, lf.Name())
return nil
}
// truncateBack truncates from the back and is used to delete conflicting entries.
func (ls *logEntryStorage) truncateBack(index uint64) error {
if ls.LastIndex() < index {
return nil
}
if ls.logfiles[0].index >= index {
return ls.TruncateAll()
}
idx := ls.locate(index)
if idx == len(ls.logfiles)-1 { // the conflict is in the last file
if err := ls.last.Truncate(index); err != nil {
return err
}
} else {
for i := idx + 1; i < len(ls.logfiles); i++ {
if err := ls.remove(ls.logfiles[i]); err != nil {
return err
}
}
n := ls.logfiles[idx]
lf, err := ls.get(n)
if err != nil {
return err
}
ls.cache.Delete(n, false)
ls.last = lf
if err := ls.last.OpenWrite(); err != nil {
return err
}
if err := ls.last.Truncate(index); err != nil {
return err
}
ls.logfiles = ls.logfiles[:idx+1]
ls.nextFileSeq = n.seq + 1
}
return nil
}
func (ls *logEntryStorage) createNew(index uint64) (*logEntryFile, error) {
name := logFileName{seq: ls.nextFileSeq, index: index}
f, err := createLogEntryFile(ls.dir, name)
if err != nil {
return nil, err
}
ls.nextFileSeq++
return f, nil
}
func (ls *logEntryStorage) get(name logFileName) (*logEntryFile, error) {
if name.seq == ls.last.Seq() {
return ls.last, nil
}
return ls.cache.Get(name)
}
func (ls *logEntryStorage) remove(name logFileName) error {
_ = ls.cache.Delete(name, true)
return os.Remove(path.Join(ls.dir, name.String()))
}
// rotate creates a new log file once the current one is full
func (ls *logEntryStorage) rotate() error {
prevLast := ls.last.LastIndex()
if err := ls.last.FinishWrite(); err != nil {
return err
}
if err := ls.last.Close(); err != nil {
return err
}
lf, err := ls.createNew(prevLast + 1)
if err != nil {
return err
}
ls.last = lf
ls.logfiles = append(ls.logfiles, lf.Name())
return nil
}
func (ls *logEntryStorage) size() int {
return len(ls.logfiles)
}
func (ls *logEntryStorage) locate(logindex uint64) int {
fi := sort.Search(len(ls.logfiles), func(i int) bool {
var nextIndex uint64
if i == len(ls.logfiles)-1 {
nextIndex = math.MaxUint64
} else {
nextIndex = ls.logfiles[i+1].index
}
return logindex < nextIndex
})
return fi
}
func (ls *logEntryStorage) locateFile(logindex uint64) (*logEntryFile, error) {
i := ls.locate(logindex)
if i >= len(ls.logfiles) {
panic("could not find log file")
}
return ls.get(ls.logfiles[i])
}
func (ls *logEntryStorage) saveEntry(ent *proto.Entry) error {
// check that the appended entry is contiguous with the existing log
prevIndex := ls.LastIndex()
if prevIndex != 0 {
if prevIndex+1 != ent.Index {
return fmt.Errorf("append discontinuous log. prev index: %d, current: %d", prevIndex, ent.Index)
}
}
// check whether the current file is full;
// also force a rotate when the file exceeds MinFileSize and was created more than MaxRotateInterval ago,
// which greatly reduces the memory overhead of the log index
woffset := ls.last.WriteOffset()
if uint64(woffset)+uint64(recordSize(ent)) > uint64(ls.filesize) ||
(woffset > MinFileSize && timeutil.GetCurrentTimeUnix()-ls.rotateTime > MaxRotateInterval) {
ls.rotateTime = timeutil.GetCurrentTimeUnix()
if err := ls.rotate(); err != nil {
return err
}
}
if err := ls.last.Save(ent); err != nil {
return err
}
return nil
}
func (ls *logEntryStorage) Close() {
if err := ls.cache.Close(); err != nil {
log.Warn("close log file cache error: %v", err)
}
if err := ls.last.Close(); err != nil {
log.Warn("close log file %s error: %v", ls.last.Name(), err)
}
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"encoding/binary"
"io"
"os"
"path"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/bufalloc"
)
type truncateMeta struct {
truncIndex uint64
truncTerm uint64
}
func (m truncateMeta) Size() uint64 {
return 16
}
func (m truncateMeta) Encode(b []byte) {
binary.BigEndian.PutUint64(b, m.truncIndex)
binary.BigEndian.PutUint64(b[8:], m.truncTerm)
}
func (m *truncateMeta) Decode(b []byte) {
m.truncIndex = binary.BigEndian.Uint64(b)
m.truncTerm = binary.BigEndian.Uint64(b[8:])
}
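// Encode/Decode round-trip sketch (illustrative): truncateMeta is a fixed
// 16-byte big-endian record, so a scratch buffer of Size() bytes is enough.
//
//	in := truncateMeta{truncIndex: 100, truncTerm: 7}
//	buf := make([]byte, in.Size()) // 16 bytes: 8 for the index, 8 for the term
//	in.Encode(buf)
//	var out truncateMeta
//	out.Decode(buf) // out == in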
// metaFile stores the HardState and the truncateMeta record
type metaFile struct {
f *os.File
truncOffset int64
}
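// META file layout implied by the write offsets used below (a sketch, assuming
// proto.HardState.Size() is constant for the zero value):
//
//	offset 0:          HardState    (hs.Size() bytes)
//	offset hs.Size():  truncateMeta (16 bytes)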
func openMetaFile(dir string) (mf *metaFile, hs proto.HardState, meta truncateMeta, err error) {
f, err := os.OpenFile(path.Join(dir, "META"), os.O_RDWR|os.O_CREATE, 0600)
if err != nil {
return
}
mf = &metaFile{
f: f,
truncOffset: int64(hs.Size()),
}
hs, meta, err = mf.load()
return mf, hs, meta, err
}
func (mf *metaFile) load() (hs proto.HardState, meta truncateMeta, err error) {
// load hardstate
hs_size := int(hs.Size())
buffer := bufalloc.AllocBuffer(hs_size)
defer bufalloc.FreeBuffer(buffer)
buf := buffer.Alloc(hs_size)
n, err := mf.f.Read(buf)
if err != nil {
if err == io.EOF {
err = nil
return
}
return
}
if n != hs_size {
err = NewCorruptError("META", 0, "wrong hardstate data size")
return
}
hs.Decode(buf)
// load trunc meta
buffer.Reset()
mt_size := int(meta.Size())
buf = buffer.Alloc(mt_size)
n, err = mf.f.Read(buf)
if err != nil {
if err == io.EOF {
err = nil
return
}
return
}
if n != mt_size {
err = NewCorruptError("META", 0, "wrong truncmeta data size")
return
}
meta.Decode(buf)
return
}
func (mf *metaFile) Close() error {
return mf.f.Close()
}
func (mf *metaFile) SaveTruncateMeta(meta truncateMeta) error {
mt_size := int(meta.Size())
buffer := bufalloc.AllocBuffer(mt_size)
defer bufalloc.FreeBuffer(buffer)
b := buffer.Alloc(mt_size)
meta.Encode(b)
_, err := mf.f.WriteAt(b, mf.truncOffset)
return err
}
func (mf *metaFile) SaveHardState(hs proto.HardState) error {
hs_size := int(hs.Size())
buffer := bufalloc.AllocBuffer(hs_size)
defer bufalloc.FreeBuffer(buffer)
b := buffer.Alloc(hs_size)
hs.Encode(b)
_, err := mf.f.WriteAt(b, 0)
return err
}
func (mf *metaFile) Sync() error {
return mf.f.Sync()
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"io"
"encoding/binary"
"fmt"
)
// Log file ({seq}.log) layout:
// [log record]
// ...
// [log record]
// [index record]
// [footer record]
// ErrCorrupt error
type ErrCorrupt struct {
filename string
offset int64
reason string
}
func (e *ErrCorrupt) Error() string {
return fmt.Sprintf("corrput data at %s:%d (%v)", e.filename, e.offset, e.reason)
}
// NewCorruptError new
func NewCorruptError(filename string, offset int64, reason string) *ErrCorrupt {
return &ErrCorrupt{
filename: filename,
offset: offset,
reason: reason,
}
}
func IsErrCorrupt(err error) (is bool) {
if err == nil {
return
}
_, is = err.(*ErrCorrupt)
return
}
type recordType uint8
const (
recTypeLogEntry recordType = 1
recTypeIndex recordType = 2
recTypeFooter recordType = 3
)
func (rt recordType) Valid() bool {
switch rt {
case recTypeLogEntry, recTypeIndex, recTypeFooter:
return true
default:
}
return false
}
func (rt recordType) String() string {
switch rt {
case recTypeLogEntry:
return "type-log"
case recTypeIndex:
return "type-index"
case recTypeFooter:
return "type-footer"
default:
return fmt.Sprintf("type-unknown(%d)", uint8(rt))
}
}
var footerMagic = []byte{'\xf9', '\xbf', '\x3e', '\x0a', '\xd3', '\xc5', '\xcc', '\x3f'}
// on-disk record layout
type record struct {
recType recordType // one byte: record type
dataLen uint64 // eight bytes, big-endian: payload length
data []byte // payload bytes: recordData.Encode()
crc uint32 // four bytes: CRC over the payload
}
// recordSize returns the maximum number of bytes needed to write a record
func recordSize(data recordData) int {
return 1 + 8 + int(data.Size()) + 4
}
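// On-disk framing sketch: [1B type][8B big-endian dataLen][dataLen bytes payload][4B crc].
// For example, a footer record occupies:
//
//	recordSize(footerRecord{}) // 1 + 8 + 16 + 4 = 29 bytes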
type recordData interface {
Encode(w io.Writer) error
Size() uint64
}
type footerRecord struct {
indexOffset uint64
magic []byte
}
func (fr footerRecord) Encode(w io.Writer) (err error) {
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, fr.indexOffset)
if _, err = w.Write(buf); err != nil {
return
}
if _, err = w.Write(footerMagic); err != nil {
return
}
return nil
}
func (fr footerRecord) Size() uint64 {
return 16
}
func (fr *footerRecord) Decode(data []byte) {
fr.indexOffset = binary.BigEndian.Uint64(data)
fr.magic = data[8 : 8+len(footerMagic)]
}
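// Decoding sketch (illustrative; data here stands for the 16-byte payload of a
// recTypeFooter record):
//
//	var fr footerRecord
//	fr.Decode(data)
//	// fr.indexOffset is the file offset of the index record;
//	// bytes.Equal(fr.magic, footerMagic) should hold for a well-formed footer.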
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bufio"
"encoding/binary"
"io"
"math"
"os"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
// After initialization is complete, records may only be read via ReadAt.
type recordReadAt interface {
ReadAt(offset int64) (rec record, err error)
}
const defaultReadBufferedSize = 512
type bufferedReader struct {
r *bufio.Reader
}
func newBufferedReader(f *os.File) *bufferedReader {
return &bufferedReader{
r: bufio.NewReaderSize(f, defaultReadBufferedSize),
}
}
func (br *bufferedReader) Read(p []byte) (total int, err error) {
n := 0
for {
n, err = br.r.Read(p)
if err != nil {
return
}
total += n
switch {
case n == len(p):
return
case n < len(p):
p = p[n:]
default:
panic("invalid read buffer")
}
}
}
type recordReader struct {
br *bufferedReader
offset int64 // start offset of the current record
sr io.ReaderAt // random-access reads
filename string
typeLenBuf []byte
}
func newRecordReader(f *os.File) *recordReader {
return &recordReader{
br: newBufferedReader(f),
sr: f,
filename: f.Name(),
typeLenBuf: make([]byte, 9), // 1-byte type + 8-byte dataLen
}
}
// Read reads the next record sequentially.
func (r *recordReader) Read() (recStartOffset int64, rec record, err error) {
recStartOffset = r.offset
// read record type and data len
n, err := r.br.Read(r.typeLenBuf)
if err != nil {
return
}
if n != len(r.typeLenBuf) {
if n < 1 {
err = NewCorruptError(r.filename, recStartOffset, "too small record type")
} else {
err = NewCorruptError(r.filename, recStartOffset, "too small record datalen")
}
return
}
// Decode and validate record type
rec.recType = recordType(r.typeLenBuf[0])
if !rec.recType.Valid() {
err = NewCorruptError(r.filename, recStartOffset, "illegal record type")
return
}
// Decode and validate record data length
rec.dataLen = binary.BigEndian.Uint64(r.typeLenBuf[1:])
if rec.dataLen+4 <= 0 || rec.dataLen > math.MaxUint32 {
err = NewCorruptError(r.filename, recStartOffset, "illegal data length")
return
}
// read data and crc
// WARN: a buffer pool must not be used here, because log entries are decoded without copying the data
rec.data = make([]byte, rec.dataLen+4)
n, err = r.br.Read(rec.data)
if err != nil {
return
}
if uint64(n) != rec.dataLen+4 {
err = NewCorruptError(r.filename, recStartOffset, "data size unmatch or too small crc")
return
}
// decode crc
rec.crc = binary.BigEndian.Uint32(rec.data[len(rec.data)-4:])
// truncate crc
rec.data = rec.data[:len(rec.data)-4]
// checksum
crc := util.NewCRC(rec.data)
if rec.crc != crc.Value() {
err = NewCorruptError(r.filename, recStartOffset, "crc mismatch")
return
}
r.offset += (1 + 8 + int64(rec.dataLen) + 4)
return
}
// ReadAt reads the record at the given offset.
func (r *recordReader) ReadAt(offset int64) (rec record, err error) {
defer func() {
if err == io.EOF {
err = NewCorruptError(r.filename, offset, "unexpected eof")
}
}()
// read record type and data len
n, err := r.sr.ReadAt(r.typeLenBuf, offset)
if err != nil {
return
}
if n != len(r.typeLenBuf) {
if n < 1 {
err = NewCorruptError(r.filename, offset, "too small record type")
} else {
err = NewCorruptError(r.filename, offset, "too small record datalen")
}
return
}
rec.recType = recordType(r.typeLenBuf[0])
rec.dataLen = binary.BigEndian.Uint64(r.typeLenBuf[1:])
// read data and crc
rec.data = make([]byte, rec.dataLen+4)
n, err = r.sr.ReadAt(rec.data, offset+int64(n))
if err != nil {
return
}
if uint64(n) != rec.dataLen+4 {
err = NewCorruptError(r.filename, offset, "data size unmatch or too small crc")
return
}
// decode crc
rec.crc = binary.BigEndian.Uint32(rec.data[len(rec.data)-4:])
// truncate crc
rec.data = rec.data[:len(rec.data)-4]
// checksum
crc := util.NewCRC(rec.data)
if rec.crc != crc.Value() {
err = NewCorruptError(r.filename, offset, "crc mismatch")
return
}
return
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"encoding/binary"
"io"
"os"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/bufalloc"
)
const initialBufferSize = 1024 * 32
const flushTriggerSize = 1024 * 1024
type recordWriter struct {
f *os.File
buf bufalloc.Buffer
u64Buf []byte
u32Buf []byte
offset int64
}
func newRecordWriter(f *os.File) *recordWriter {
return &recordWriter{
f: f,
u64Buf: make([]byte, 8),
u32Buf: make([]byte, 4),
}
}
func (w *recordWriter) Write(recType recordType, data recordData) error {
if w.buf == nil {
w.buf = bufalloc.AllocBuffer(initialBufferSize)
}
w.buf.Grow(recordSize(data))
// write record type
w.buf.WriteByte(byte(recType))
// write data size
binary.BigEndian.PutUint64(w.u64Buf, data.Size())
w.buf.Write(w.u64Buf)
// write data
prevLen := w.buf.Len()
data.Encode(w.buf)
if uint64(w.buf.Len()-prevLen) != data.Size() {
panic("fbase/raft/logstorage: unexpected data size when decode " + recType.String())
}
// write crc
crc := util.NewCRC(w.buf.Bytes()[w.buf.Len()-int(data.Size()):])
binary.BigEndian.PutUint32(w.u32Buf, crc.Value())
w.buf.Write(w.u32Buf)
w.offset += int64(recordSize(data))
if err := w.tryToFlush(); err != nil {
return err
}
return nil
}
func (w *recordWriter) tryToFlush() error {
if w.buf != nil && w.buf.Len() >= flushTriggerSize {
return w.Flush()
}
return nil
}
func (w *recordWriter) Offset() int64 {
return w.offset
}
func (w *recordWriter) Truncate(offset int64) error {
if err := w.f.Truncate(offset); err != nil {
return err
}
w.offset = offset
_, err := w.f.Seek(offset, io.SeekStart)
return err
}
func (w *recordWriter) Flush() error {
if w.buf != nil && w.buf.Len() > 0 {
_, err := w.buf.WriteTo(w.f)
if err != nil {
return err
}
}
return nil
}
func (w *recordWriter) Sync() error {
if err := w.Flush(); err != nil {
return err
}
return w.f.Sync()
}
// Close flushes and syncs pending data, then releases the write buffer
func (w *recordWriter) Close() error {
if err := w.Sync(); err != nil {
return err
}
if w.buf != nil {
bufalloc.FreeBuffer(w.buf)
}
return nil
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"errors"
"fmt"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
)
// Storage the storage
type Storage struct {
c *Config
// Log Entry
ls *logEntryStorage
truncIndex uint64
truncTerm uint64
hardState proto.HardState
metafile *metaFile
prevCommit uint64 // sync when the commit index changes
closed bool
}
// NewStorage new
func NewStorage(dir string, c *Config) (*Storage, error) {
if err := initDir(dir); err != nil {
return nil, err
}
// load HardState and truncate meta
mf, hardState, meta, err := openMetaFile(dir)
if err != nil {
return nil, err
}
s := &Storage{
c: c.dup(),
truncIndex: meta.truncIndex,
truncTerm: meta.truncTerm,
hardState: hardState,
metafile: mf,
prevCommit: hardState.Commit,
}
// load the log files
ls, err := openLogStorage(dir, s)
if err != nil {
return nil, err
}
s.ls = ls
if c.GetTruncateFirstDummy() {
if err := s.truncateFirstDummy(); err != nil {
return nil, err
}
}
return s, nil
}
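// Typical lifecycle sketch (illustrative only; the zero Config and the variable
// names are placeholders, not the authoritative way to configure the storage):
//
//	s, err := NewStorage("/path/to/wal", &Config{})
//	if err != nil {
//		return err
//	}
//	defer s.Close()
//	_ = s.StoreHardState(hs)
//	_ = s.StoreEntries(ents)
//	entries, isCompact, err := s.Entries(lo, hi, maxSize)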
func (s *Storage) truncateFirstDummy() error {
// only allowed at initialization time (no log entries may already exist)
li, err := s.LastIndex()
if err != nil {
return err
}
if li != 0 {
return errors.New("truncate first dummy forbidden")
}
meta := truncateMeta{
truncIndex: 1,
truncTerm: 1,
}
if err = s.metafile.SaveTruncateMeta(meta); err != nil {
return err
}
if err = s.metafile.Sync(); err != nil {
return err
}
s.truncIndex = meta.truncIndex
s.truncTerm = meta.truncTerm
return nil
}
// InitialState returns the saved HardState information to init the repl state.
func (s *Storage) InitialState() (proto.HardState, error) {
return s.hardState, nil
}
// Entries returns a slice of log entries in the range [lo, hi); hi is not inclusive.
// maxSize limits the total size of the returned entries, but Entries returns at least one entry if any exist.
// If lo <= CompactIndex, isCompact is returned as true.
// If there are no entries, entries is nil.
// Note: maxSize == math.MaxUint32 means no limit.
func (s *Storage) Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error) {
if lo <= s.truncIndex {
return nil, true, nil
}
entries, isCompact, err = s.ls.Entries(lo, hi, maxSize)
return
}
// Term returns the term of the entry at index, which must be in the range [FirstIndex()-1, LastIndex()].
// The term of the entry before FirstIndex is retained for matching purposes even though the
// rest of that entry may not be available.
// If index < CompactIndex, isCompact is returned as true.
func (s *Storage) Term(index uint64) (term uint64, isCompact bool, err error) {
switch {
case index < s.truncIndex:
return 0, true, nil
case index == s.truncIndex:
term = s.truncTerm
return
default:
term, isCompact, err = s.ls.Term(index)
return
}
}
// FirstIndex returns the index of the first log entry that is possibly available via Entries (older entries have been incorporated
// into the latest Snapshot; if storage only contains the dummy entry the first log entry is not available).
func (s *Storage) FirstIndex() (index uint64, err error) {
index = s.truncIndex + 1
return
}
// LastIndex returns the index of the last entry in the log.
func (s *Storage) LastIndex() (index uint64, err error) {
index = s.ls.LastIndex()
if index < s.truncIndex {
index = s.truncIndex
}
return
}
// StoreEntries stores the log entries to the repository.
// If the first index of entries > LastIndex, all entries are appended;
// otherwise entries are written starting at their first index and the conflicting suffix is truncated.
func (s *Storage) StoreEntries(entries []*proto.Entry) error {
if err := s.ls.SaveEntries(entries); err != nil {
return err
}
return nil
}
// StoreHardState store the raft state to the repository.
func (s *Storage) StoreHardState(st proto.HardState) error {
if err := s.metafile.SaveHardState(st); err != nil {
return err
}
s.hardState = st
if s.c.GetSync() {
sync := false
if st.Commit != s.prevCommit {
sync = true
s.prevCommit = st.Commit
}
if sync {
if err := s.metafile.Sync(); err != nil {
return err
}
if err := s.ls.Sync(); err != nil {
return err
}
}
}
return nil
}
// Truncate discards log entries up to index; the index is inclusive.
func (s *Storage) Truncate(index uint64) error {
if index <= s.truncIndex {
log.Warn("already truncated. index=%d", index)
return nil
}
term, isCompact, err := s.ls.Term(index)
if err != nil {
return err
}
if isCompact {
return fmt.Errorf("expected compacted term. index:%d", index)
}
// update the truncate meta
meta := truncateMeta{
truncIndex: index,
truncTerm: term,
}
logger.Debug("Storage truncate index %v term %v", index, term)
if err = s.metafile.SaveTruncateMeta(meta); err != nil {
return err
}
if err = s.metafile.Sync(); err != nil {
return err
}
// truncate the log files
if err = s.ls.TruncateFront(index); err != nil {
return err
}
s.truncIndex = index
s.truncTerm = term
return nil
}
// ApplySnapshot Sync snapshot status.
func (s *Storage) ApplySnapshot(meta proto.SnapshotMeta) error {
tMeta := truncateMeta{
truncIndex: meta.Index,
truncTerm: meta.Term,
}
var err error
// update the commit position
s.hardState.Commit = meta.Index
if err := s.metafile.SaveHardState(s.hardState); err != nil {
return err
}
if err = s.metafile.SaveTruncateMeta(tMeta); err != nil {
return err
}
if err = s.metafile.Sync(); err != nil {
return err
}
if err = s.ls.TruncateAll(); err != nil {
return err
}
s.truncIndex = meta.Index
s.truncTerm = meta.Term
return nil
}
// Close the storage.
func (s *Storage) Close() {
if !s.closed {
s.ls.Close()
s.metafile.Close()
s.closed = true
}
}
type metricReporter struct {
ID string
}
func newReporterWithID(id string) *metricReporter {
return &metricReporter{
ID: id,
}
}
func (r *metricReporter) ReportInterval() time.Duration {
return time.Minute
}
func (r *metricReporter) Report(data []byte) error {
logger.Info("wal [%s] metrics: %s", r.ID, string(data))
return nil
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bytes"
"fmt"
"math/rand"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
func compapreEntry(le, re *proto.Entry) error {
if le.Index != re.Index {
return fmt.Errorf("index mismatch: %d != %d", le.Index, re.Index)
}
if le.Type != re.Type {
return fmt.Errorf("type mismatch: %d != %d", le.Type, re.Type)
}
if le.Term != re.Term {
return fmt.Errorf("term mismatch: %d != %d", le.Term, re.Term)
}
if !bytes.Equal(le.Data, re.Data) {
return fmt.Errorf("data mismatch: %s != %s", string(le.Data), string(re.Data))
}
return nil
}
func compareEntries(lh, rh []*proto.Entry) error {
if len(lh) != len(rh) {
return fmt.Errorf("unmatch size: %d != %d", len(lh), len(rh))
}
for i := 0; i < len(lh); i++ {
le := lh[i]
re := rh[i]
if err := compapreEntry(le, re); err != nil {
return fmt.Errorf("%v at %d", err, i)
}
}
return nil
}
func genLogEntry(rnd *rand.Rand, i uint64) *proto.Entry {
randType := func() proto.EntryType {
switch rnd.Int() % 2 {
case 0:
return proto.EntryNormal
default:
return proto.EntryConfChange
}
}
randTerm := func() uint64 {
return uint64(rnd.Uint32())
}
randData := func() []byte {
const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
length := 10 + rnd.Int()%100
buf := make([]byte, length)
for i := 0; i < length; i++ {
buf[i] = letters[rnd.Int()%len(letters)]
}
return buf
}
ent := &proto.Entry{
Index: i,
Type: randType(),
Term: randTerm(),
Data: randData(),
}
return ent
}
func genLogEntries(lo, hi uint64) (ents []*proto.Entry) {
rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
for i := lo; i < hi; i++ {
ents = append(ents, genLogEntry(rnd, i))
}
return
}
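// Usage sketch (illustrative; t stands for a hypothetical *testing.T): generate
// random entries and verify a store/read round trip.
//
//	ents := genLogEntries(1, 101) // indexes [1, 101)
//	// ... store ents, then read them back as got ...
//	if err := compareEntries(ents, got); err != nil {
//		t.Fatal(err)
//	}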
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"net"
"sync"
//"fmt"
//"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type heartbeatTransport struct {
config *TransportConfig
raftServer *RaftServer
listener net.Listener
mu sync.RWMutex
senders map[uint64]*transportSender
stopc chan struct{}
}
func newHeartbeatTransport(raftServer *RaftServer, config *TransportConfig) (*heartbeatTransport, error) {
var (
listener net.Listener
err error
)
if listener, err = net.Listen("tcp", config.HeartbeatAddr); err != nil {
return nil, err
}
t := &heartbeatTransport{
config: config,
raftServer: raftServer,
listener: listener,
senders: make(map[uint64]*transportSender),
stopc: make(chan struct{}),
}
return t, nil
}
func (t *heartbeatTransport) stop() {
t.mu.Lock()
defer t.mu.Unlock()
select {
case <-t.stopc:
return
default:
close(t.stopc)
t.listener.Close()
for _, s := range t.senders {
s.stop()
}
}
}
func (t *heartbeatTransport) start() {
util.RunWorkerUtilStop(func() {
for {
select {
case <-t.stopc:
return
default:
conn, err := t.listener.Accept()
if err != nil {
continue
}
t.handleConn(util.NewConnTimeout(conn))
}
}
}, t.stopc)
}
func (t *heartbeatTransport) handleConn(conn *util.ConnTimeout) {
util.RunWorker(func() {
defer conn.Close()
bufRd := util.NewBufferReader(conn, 16*KB)
for {
select {
case <-t.stopc:
return
default:
if msg, err := reciveMessage(bufRd); err != nil {
logger.Error(fmt.Sprintf("[heartbeatTransport] recive message from conn error, %s", err.Error()))
return
} else {
//logger.Debug(fmt.Sprintf("Recive %v from (%v)", msg.ToString(), conn.RemoteAddr()))
t.raftServer.reciveMessage(msg)
}
}
}
})
}
func (t *heartbeatTransport) send(msg *proto.Message) {
s := t.getSender(msg.To)
s.send(msg)
}
func (t *heartbeatTransport) getSender(nodeId uint64) *transportSender {
t.mu.RLock()
sender, ok := t.senders[nodeId]
t.mu.RUnlock()
if ok {
return sender
}
t.mu.Lock()
defer t.mu.Unlock()
if sender, ok = t.senders[nodeId]; !ok {
sender = newTransportSender(nodeId, 1, 64, HeartBeat, t.config.Resolver)
t.senders[nodeId] = sender
}
return sender
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type MultiTransport struct {
heartbeat *heartbeatTransport
replicate *replicateTransport
}
func NewMultiTransport(raft *RaftServer, config *TransportConfig) (Transport, error) {
mt := new(MultiTransport)
if ht, err := newHeartbeatTransport(raft, config); err != nil {
return nil, err
} else {
mt.heartbeat = ht
}
if rt, err := newReplicateTransport(raft, config); err != nil {
return nil, err
} else {
mt.replicate = rt
}
mt.heartbeat.start()
mt.replicate.start()
return mt, nil
}
func (t *MultiTransport) Stop() {
t.heartbeat.stop()
t.replicate.stop()
}
func (t *MultiTransport) Send(m *proto.Message) {
// if m.IsElectionMsg() {
if m.IsHeartbeatMsg() {
t.heartbeat.send(m)
} else {
t.replicate.send(m)
}
}
func (t *MultiTransport) SendSnapshot(m *proto.Message, rs *snapshotStatus) {
t.replicate.sendSnapshot(m, rs)
}
func reciveMessage(r *util.BufferReader) (msg *proto.Message, err error) {
msg = proto.GetMessage()
if err = msg.Decode(r); err != nil {
proto.ReturnMessage(msg)
}
return
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"encoding/binary"
"fmt"
"io"
"net"
"runtime"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type replicateTransport struct {
config *TransportConfig
raftServer *RaftServer
listener net.Listener
curSnapshot int32
mu sync.RWMutex
senders map[uint64]*transportSender
stopc chan struct{}
}
func newReplicateTransport(raftServer *RaftServer, config *TransportConfig) (*replicateTransport, error) {
var (
listener net.Listener
err error
)
if listener, err = net.Listen("tcp", config.ReplicateAddr); err != nil {
return nil, err
}
t := &replicateTransport{
config: config,
raftServer: raftServer,
listener: listener,
senders: make(map[uint64]*transportSender),
stopc: make(chan struct{}),
}
return t, nil
}
func (t *replicateTransport) stop() {
t.mu.Lock()
defer t.mu.Unlock()
select {
case <-t.stopc:
return
default:
close(t.stopc)
t.listener.Close()
for _, s := range t.senders {
s.stop()
}
}
}
func (t *replicateTransport) send(m *proto.Message) {
s := t.getSender(m.To)
s.send(m)
}
func (t *replicateTransport) getSender(nodeId uint64) *transportSender {
t.mu.RLock()
sender, ok := t.senders[nodeId]
t.mu.RUnlock()
if ok {
return sender
}
t.mu.Lock()
defer t.mu.Unlock()
if sender, ok = t.senders[nodeId]; !ok {
sender = newTransportSender(nodeId, uint64(t.config.MaxReplConcurrency), t.config.SendBufferSize, Replicate, t.config.Resolver)
t.senders[nodeId] = sender
}
return sender
}
func (t *replicateTransport) sendSnapshot(m *proto.Message, rs *snapshotStatus) {
var (
conn *util.ConnTimeout
err error
)
defer func() {
atomic.AddInt32(&t.curSnapshot, -1)
rs.respond(err)
if conn != nil {
conn.Close()
}
if err != nil {
logger.Error("[Transport] %v send snapshot to %v failed error is: %v.", m.ID, m.To, err)
} else if logger.IsEnableWarn() {
logger.Warn("[Transport] %v send snapshot to %v successful.", m.ID, m.To)
}
}()
if atomic.AddInt32(&t.curSnapshot, 1) > int32(t.config.MaxSnapConcurrency) {
err = fmt.Errorf("snapshot concurrency exceed the limit %v, now %d", t.config.MaxSnapConcurrency, t.curSnapshot)
return
}
if conn = getConn(m.To, Replicate, t.config.Resolver, 10*time.Minute, 1*time.Minute); conn == nil {
err = fmt.Errorf("can't get connection to %v.", m.To)
return
}
// send snapshot header message
bufWr := util.NewBufferWriter(conn, 1*MB)
if err = m.Encode(bufWr); err != nil {
return
}
if err = bufWr.Flush(); err != nil {
return
}
// send snapshot data
var (
data []byte
loopCount = 0
sizeBuf = make([]byte, 4)
)
for err == nil {
loopCount = loopCount + 1
if loopCount > 16 {
loopCount = 0
runtime.Gosched()
}
select {
case <-rs.stopCh:
err = fmt.Errorf("raft has shutdown.")
default:
data, err = m.Snapshot.Next()
if len(data) > 0 {
// write block size
binary.BigEndian.PutUint32(sizeBuf, uint32(len(data)))
if _, err = bufWr.Write(sizeBuf); err == nil {
_, err = bufWr.Write(data)
}
}
}
}
// write end flag and flush
if err != nil && err != io.EOF {
return
}
binary.BigEndian.PutUint32(sizeBuf, 0)
if _, err = bufWr.Write(sizeBuf); err != nil {
return
}
if err = bufWr.Flush(); err != nil {
return
}
// wait response
err = nil
resp := make([]byte, 1)
io.ReadFull(conn, resp)
if resp[0] != 1 {
err = fmt.Errorf("follower response failed.")
}
}
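// Snapshot wire format implied by sendSnapshot/handleSnapshot (a summary, not an
// authoritative spec):
//
//	[snapshot header message]
//	[4-byte big-endian block size][block data]  (repeated per Snapshot.Next block)
//	[4-byte zero]                               (end-of-snapshot marker)
//	<- [1 byte]                                 (ack from the follower: 1 on success)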
func (t *replicateTransport) start() {
util.RunWorkerUtilStop(func() {
for {
select {
case <-t.stopc:
return
default:
conn, err := t.listener.Accept()
if err != nil {
continue
}
t.handleConn(util.NewConnTimeout(conn))
}
}
}, t.stopc)
}
func (t *replicateTransport) handleConn(conn *util.ConnTimeout) {
util.RunWorker(func() {
defer conn.Close()
loopCount := 0
bufRd := util.NewBufferReader(conn, 16*KB)
for {
loopCount = loopCount + 1
if loopCount > 16 {
loopCount = 0
runtime.Gosched()
}
select {
case <-t.stopc:
return
default:
if msg, err := reciveMessage(bufRd); err != nil {
return
} else {
//logger.Debug(fmt.Sprintf("Recive %v from (%v)", msg.ToString(), conn.RemoteAddr()))
if msg.Type == proto.ReqMsgSnapShot {
if err := t.handleSnapshot(msg, conn, bufRd); err != nil {
return
}
} else {
t.raftServer.reciveMessage(msg)
}
}
}
}
})
}
var snap_ack = []byte{1}
func (t *replicateTransport) handleSnapshot(m *proto.Message, conn *util.ConnTimeout, bufRd *util.BufferReader) error {
conn.SetReadTimeout(time.Minute)
conn.SetWriteTimeout(15 * time.Second)
bufRd.Grow(1 * MB)
req := newSnapshotRequest(m, bufRd)
t.raftServer.reciveSnapshot(req)
// wait snapshot result
if err := req.response(); err != nil {
logger.Error("[Transport] handle snapshot request from %v error: %v.", m.From, err)
return err
}
_, err := conn.Write(snap_ack)
return err
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"runtime"
"sync"
"time"
//"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type unreachableReporter func(uint64)
type transportSender struct {
nodeID uint64
concurrency uint64
senderType SocketType
resolver SocketResolver
inputc []chan *proto.Message
send func(msg *proto.Message)
mu sync.Mutex
stopc chan struct{}
}
func newTransportSender(nodeID, concurrency uint64, buffSize int, senderType SocketType, resolver SocketResolver) *transportSender {
sender := &transportSender{
nodeID: nodeID,
concurrency: concurrency,
senderType: senderType,
resolver: resolver,
inputc: make([]chan *proto.Message, concurrency),
stopc: make(chan struct{}),
}
for i := uint64(0); i < concurrency; i++ {
sender.inputc[i] = make(chan *proto.Message, buffSize)
sender.loopSend(sender.inputc[i])
}
if (concurrency & (concurrency - 1)) == 0 {
sender.send = func(msg *proto.Message) {
idx := 0
if concurrency > 1 {
idx = int(msg.ID&concurrency - 1)
}
sender.inputc[idx] <- msg
}
} else {
sender.send = func(msg *proto.Message) {
idx := 0
if concurrency > 1 {
idx = int(msg.ID % concurrency)
}
sender.inputc[idx] <- msg
}
}
return sender
}
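// Sharding note (illustrative): when concurrency is a power of two, the fast path
// msg.ID & (concurrency-1) selects the same channel as msg.ID % concurrency while
// avoiding the modulo, e.g. with concurrency = 4:
//
//	msg.ID = 10 -> 10 & 3 = 2 // same as 10 % 4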
func (s *transportSender) stop() {
s.mu.Lock()
defer s.mu.Unlock()
select {
case <-s.stopc:
return
default:
close(s.stopc)
}
}
func (s *transportSender) loopSend(recvc chan *proto.Message) {
util.RunWorkerUtilStop(func() {
conn := getConn(s.nodeID, s.senderType, s.resolver, 0, 2*time.Second)
bufWr := util.NewBufferWriter(conn, 16*KB)
defer func() {
if conn != nil {
conn.Close()
}
}()
loopCount := 0
var err error
for {
loopCount = loopCount + 1
if loopCount > 8 {
loopCount = 0
runtime.Gosched()
}
select {
case <-s.stopc:
return
case msg := <-recvc:
if conn == nil {
conn = getConn(s.nodeID, s.senderType, s.resolver, 0, 2*time.Second)
if conn == nil {
proto.ReturnMessage(msg)
// reset chan
for {
select {
case msg := <-recvc:
proto.ReturnMessage(msg)
continue
default:
}
break
}
time.Sleep(50 * time.Millisecond)
continue
}
bufWr.Reset(conn)
}
err = msg.Encode(bufWr)
proto.ReturnMessage(msg)
if err != nil {
goto flush
}
// group send message
flag := false
for i := 0; i < 16; i++ {
select {
case msg := <-recvc:
err = msg.Encode(bufWr)
//logger.Debug(fmt.Sprintf("SendMesg %v to (%v) ", msg.ToString(), conn.RemoteAddr()))
proto.ReturnMessage(msg)
if err != nil {
goto flush
}
default:
flag = true
}
if flag {
break
}
}
}
flush:
// flush write
if err == nil {
err = bufWr.Flush()
}
if err != nil {
logger.Error("[Transport]send message[%s] to %v[%s] error:[%v].", s.senderType, s.nodeID, conn.RemoteAddr(), err)
conn.Close()
conn = nil
}
}
}, s.stopc)
}
func getConn(nodeID uint64, socketType SocketType, resolver SocketResolver, rdTime, wrTime time.Duration) (conn *util.ConnTimeout) {
var (
addr string
err error
)
if addr, err = resolver.NodeAddress(nodeID, socketType); err == nil {
if conn, err = util.DialTimeout(addr, 2*time.Second); err == nil {
conn.SetReadTimeout(rdTime)
conn.SetWriteTimeout(wrTime)
}
}
if err != nil {
conn = nil
if logger.IsEnableDebug() {
logger.Debug("[Transport] get connection[%s] to %v[%s] failed,error is: %s", socketType, nodeID, addr, err)
}
}
return
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import "sync/atomic"
type AtomicBool struct {
v int32
}
func (b *AtomicBool) Get() bool {
return atomic.LoadInt32(&b.v) != 0
}
func (b *AtomicBool) Set(newValue bool) {
atomic.StoreInt32(&b.v, boolToInt(newValue))
}
func (b *AtomicBool) CompareAndSet(expect, update bool) bool {
return atomic.CompareAndSwapInt32(&b.v, boolToInt(expect), boolToInt(update))
}
func boolToInt(v bool) int32 {
if v {
return 1
} else {
return 0
}
}
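// Usage sketch (illustrative):
//
//	var closed AtomicBool
//	if closed.CompareAndSet(false, true) {
//		// only the first caller gets here; later calls return false
//	}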
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import "sync/atomic"
type AtomicUInt64 struct {
v uint64
}
func (a *AtomicUInt64) Get() uint64 {
return atomic.LoadUint64(&a.v)
}
func (a *AtomicUInt64) Set(v uint64) {
atomic.StoreUint64(&a.v, v)
}
func (a *AtomicUInt64) Add(v uint64) uint64 {
return atomic.AddUint64(&a.v, v)
}
func (a *AtomicUInt64) Incr() uint64 {
return atomic.AddUint64(&a.v, 1)
}
func (a *AtomicUInt64) CompareAndSwap(o, n uint64) bool {
return atomic.CompareAndSwapUint64(&a.v, o, n)
}
// Copyright 2009 The Go Authors. All rights reserved.
// Modified work copyright 2018 The tiglabs Authors.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bufalloc
import (
"io"
)
// A Buffer is a variable-sized buffer of bytes with Read and Write methods.
type Buffer interface {
// Alloc allocs n bytes of slice from the buffer, growing the buffer as needed.
// If n is negative, Alloc will panic. If the buffer can't grow it will panic with bytes.ErrTooLarge.
Alloc(n int) []byte
// Truncate discards all but the first n unread bytes from the buffer.
// It panics if n is negative or greater than the length of the buffer.
Truncate(n int)
// Grow grows the buffer's capacity, if necessary, to guarantee space for n bytes.
// If n is negative, Grow will panic. If the buffer can't grow it will panic with bytes.ErrTooLarge.
Grow(n int)
// Write appends the contents of p to the buffer, growing the buffer as needed.
// The return value n is the length of p; err is always nil.
// If the buffer becomes too large, Write will panic with bytes.ErrTooLarge.
Write(p []byte) (n int, err error)
// WriteByte appends the byte c to the buffer, growing the buffer as needed.
// If the buffer becomes too large, WriteByte will panic with bytes.ErrTooLarge.
WriteByte(c byte) error
// WriteTo writes data to w until the buffer is drained or an error occurs.
// The return value n is the number of bytes written;
// Any error encountered during the write is also returned.
WriteTo(w io.Writer) (n int64, err error)
// Read reads the next len(p) bytes from the buffer or until the buffer is drained.
// The return value n is the number of bytes read.
// If the buffer has no data to return, err is io.EOF (unless len(p) is zero); otherwise it is nil.
Read(p []byte) (n int, err error)
// ReadByte reads and returns the next byte from the buffer.
// If no byte is available, it returns error io.EOF.
ReadByte() (c byte, err error)
// ReadBytes reads until the first occurrence of delim in the input,
// returning a slice containing the data up to and including the delimiter.
// If ReadBytes encounters an error before finding a delimiter, it returns the data read before the error and the error itself (often io.EOF).
// ReadBytes returns err != nil if and only if the returned data does not end in delim.
ReadBytes(delim byte) (line []byte, err error)
// ReadFrom reads data from r until EOF and appends it to the buffer, growing the buffer as needed.
// The return value n is the number of bytes read. Any error except io.EOF encountered during the read is also returned.
// If the buffer becomes too large, ReadFrom will panic with bytes.ErrTooLarge.
ReadFrom(r io.Reader) (n int64, err error)
// Bytes returns a slice of the contents of the unread portion of the buffer;
// If the caller changes the contents of the returned slice, the contents of the buffer will change,
// provided there are no intervening method calls on the Buffer.
Bytes() []byte
// Next returns a slice containing the next n bytes from the buffer, advancing the buffer as if the bytes had been returned by Read.
// If there are fewer than n bytes in the buffer, Next returns the entire buffer.
// The slice is only valid until the next call to a read or write method.
Next(n int) []byte
// Reset resets the buffer so it has no content.
// b.Reset() is the same as b.Truncate(0).
Reset()
// String returns the contents of the unread portion of the buffer as a string.
// If the Buffer is a nil pointer, it returns "<nil>".
String() string
// Len returns the number of bytes of the unread portion of the buffer;
Len() int
// Cap returns the capacity of the buffer.
Cap() int
}
func AllocBuffer(n int) Buffer {
return buffPool.getBuffer(n)
}
func FreeBuffer(buf Buffer) {
buffPool.putBuffer(buf)
}
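// Usage sketch (illustrative): borrow a pooled buffer, use it, and return it.
//
//	buf := AllocBuffer(1024)
//	defer FreeBuffer(buf)
//	b := buf.Alloc(16) // 16-byte scratch slice backed by the pooled buffer
//	_ = b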
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bufalloc
import (
"sync"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
const (
baseSize = 15
bigSize = 64 * util.KB
)
var buffPool *bufferPool
func init() {
buffPool = &bufferPool{
baseline: [...]int{64, 128, 256, 512, util.KB, 2 * util.KB, 4 * util.KB, 8 * util.KB, 16 * util.KB, 32 * util.KB, 64 * util.KB, 128 * util.KB, 256 * util.KB, 512 * util.KB, util.MB},
}
for i, n := range buffPool.baseline {
buffPool.pool[i] = createPool(n)
}
buffPool.pool[baseSize] = createPool(0)
}
func createPool(n int) *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if n == 0 || n > bigSize {
return &ibuffer{}
}
return &ibuffer{buf: makeSlice(n)}
},
}
}
type bufferPool struct {
baseline [baseSize]int
pool [baseSize + 1]*sync.Pool
}
func (p *bufferPool) getPoolNum(n int) int {
for i, x := range p.baseline {
if n <= x {
return i
}
}
return baseSize
}
func (p *bufferPool) getBuffer(n int) Buffer {
num := p.getPoolNum(n)
pool := p.pool[num]
buf := pool.Get().(Buffer)
if buf.Cap() < n {
// return old buffer to pool
buffPool.putBuffer(buf)
buf = &ibuffer{buf: makeSlice(n)}
}
buf.Reset()
return buf
}
func (p *bufferPool) putBuffer(buf Buffer) {
num := p.getPoolNum(buf.Cap())
pool := p.pool[num]
pool.Put(buf)
}
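// Size-class selection sketch: getPoolNum picks the smallest baseline >= n and
// falls back to the unsized pool for anything larger than 1MB, e.g.:
//
//	buffPool.getPoolNum(100)          // 1  -> 128-byte class
//	buffPool.getPoolNum(64 * util.KB) // 10 -> 64KB class
//	buffPool.getPoolNum(2 * util.MB)  // 15 -> unsized pool, allocated on demand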
// Copyright 2009 The Go Authors. All rights reserved.
// Modified work copyright 2018 The tiglabs Authors.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bufalloc
import (
"bytes"
"errors"
"io"
)
const minRead = 512
var (
ErrTooLarge = errors.New("bufalloc.Buffer: too large.")
)
type ibuffer struct {
off int
buf []byte
}
func makeSlice(n int) []byte {
defer func() {
if recover() != nil {
panic(ErrTooLarge)
}
}()
return make([]byte, n)
}
func (b *ibuffer) Bytes() []byte { return b.buf[b.off:] }
func (b *ibuffer) String() string {
if b == nil {
return "<nil>"
}
return string(b.buf[b.off:])
}
func (b *ibuffer) Len() int { return len(b.buf) - b.off }
func (b *ibuffer) Cap() int { return cap(b.buf) }
func (b *ibuffer) Reset() { b.Truncate(0) }
func (b *ibuffer) Truncate(n int) {
switch {
case n < 0 || n > b.Len():
panic("bufalloc.Buffer: truncation out of range")
case n == 0:
b.off = 0
}
b.buf = b.buf[0 : b.off+n]
}
func (b *ibuffer) grow(n int) int {
if b.buf == nil {
b.buf = makeSlice(n)
return 0
}
m := b.Len()
if m == 0 && b.off != 0 {
b.Truncate(0)
}
if len(b.buf)+n > cap(b.buf) {
var buf []byte
if m+n <= cap(b.buf)/2 {
copy(b.buf[:], b.buf[b.off:])
buf = b.buf[:m]
} else {
buf = makeSlice(2*cap(b.buf) + n)
copy(buf, b.buf[b.off:])
}
b.buf = buf
b.off = 0
}
b.buf = b.buf[0 : b.off+m+n]
return b.off + m
}
func (b *ibuffer) Alloc(n int) []byte {
if n < 0 {
panic("bufalloc.Buffer: negative count")
}
m := b.grow(n)
return b.buf[m:]
}
func (b *ibuffer) Grow(n int) {
if n < 0 {
panic("bufalloc.Buffer: negative count")
}
m := b.grow(n)
b.buf = b.buf[0:m]
}
func (b *ibuffer) Write(p []byte) (n int, err error) {
m := b.grow(len(p))
return copy(b.buf[m:], p), nil
}
func (b *ibuffer) ReadFrom(r io.Reader) (n int64, err error) {
if b.off >= len(b.buf) {
b.Truncate(0)
}
for {
if free := cap(b.buf) - len(b.buf); free < minRead {
// not enough space at end
newBuf := b.buf
if b.off+free < minRead {
newBuf = makeSlice(2*cap(b.buf) + minRead)
}
copy(newBuf, b.buf[b.off:])
b.buf = newBuf[:len(b.buf)-b.off]
b.off = 0
}
m, e := r.Read(b.buf[len(b.buf):cap(b.buf)])
b.buf = b.buf[0 : len(b.buf)+m]
n += int64(m)
if e == io.EOF {
break
}
if e != nil {
return n, e
}
}
return n, nil // err is EOF, so return nil explicitly
}
func (b *ibuffer) WriteTo(w io.Writer) (n int64, err error) {
if b.off < len(b.buf) {
nBytes := b.Len()
m, e := w.Write(b.buf[b.off:])
if m > nBytes {
panic("bufalloc.Buffer: invalid Write count")
}
b.off += m
n = int64(m)
if e != nil {
return n, e
}
if m != nBytes {
return n, io.ErrShortWrite
}
}
b.Truncate(0)
return
}
func (b *ibuffer) WriteByte(c byte) error {
m := b.grow(1)
b.buf[m] = c
return nil
}
func (b *ibuffer) Read(p []byte) (n int, err error) {
if b.off >= len(b.buf) {
b.Truncate(0)
if len(p) == 0 {
return
}
return 0, io.EOF
}
n = copy(p, b.buf[b.off:])
b.off += n
return
}
func (b *ibuffer) Next(n int) []byte {
m := b.Len()
if n > m {
n = m
}
data := b.buf[b.off : b.off+n]
b.off += n
return data
}
func (b *ibuffer) ReadByte() (c byte, err error) {
if b.off >= len(b.buf) {
b.Truncate(0)
return 0, io.EOF
}
c = b.buf[b.off]
b.off++
return c, nil
}
func (b *ibuffer) ReadBytes(delim byte) (line []byte, err error) {
slice, err := b.readSlice(delim)
line = append(line, slice...)
return
}
func (b *ibuffer) readSlice(delim byte) (line []byte, err error) {
i := bytes.IndexByte(b.buf[b.off:], delim)
end := b.off + i + 1
if i < 0 {
end = len(b.buf)
err = io.EOF
}
line = b.buf[b.off:end]
b.off = end
return line, err
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"net"
"time"
)
type ConnTimeout struct {
addr string
conn net.Conn
readTime time.Duration
writeTime time.Duration
}
func DialTimeout(addr string, connTime time.Duration) (*ConnTimeout, error) {
conn, err := net.DialTimeout("tcp", addr, connTime)
if err != nil {
return nil, err
}
conn.(*net.TCPConn).SetNoDelay(true)
conn.(*net.TCPConn).SetLinger(0)
conn.(*net.TCPConn).SetKeepAlive(true)
return &ConnTimeout{conn: conn, addr: addr}, nil
}
func NewConnTimeout(conn net.Conn) *ConnTimeout {
if conn == nil {
return nil
}
conn.(*net.TCPConn).SetNoDelay(true)
conn.(*net.TCPConn).SetLinger(0)
conn.(*net.TCPConn).SetKeepAlive(true)
return &ConnTimeout{conn: conn, addr: conn.RemoteAddr().String()}
}
func (c *ConnTimeout) SetReadTimeout(timeout time.Duration) {
c.readTime = timeout
}
func (c *ConnTimeout) SetWriteTimeout(timeout time.Duration) {
c.writeTime = timeout
}
func (c *ConnTimeout) Read(p []byte) (n int, err error) {
if c.readTime.Nanoseconds() > 0 {
err = c.conn.SetReadDeadline(time.Now().Add(c.readTime))
if err != nil {
return
}
}
n, err = c.conn.Read(p)
return
}
func (c *ConnTimeout) Write(p []byte) (n int, err error) {
if c.writeTime.Nanoseconds() > 0 {
err = c.conn.SetWriteDeadline(time.Now().Add(c.writeTime))
if err != nil {
return
}
}
n, err = c.conn.Write(p)
return
}
func (c *ConnTimeout) RemoteAddr() string {
return c.addr
}
func (c *ConnTimeout) Close() error {
return c.conn.Close()
}
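// Usage sketch (illustrative; the address is a placeholder):
//
//	conn, err := DialTimeout("127.0.0.1:9000", 2*time.Second)
//	if err != nil {
//		return err
//	}
//	defer conn.Close()
//	conn.SetReadTimeout(time.Minute)
//	conn.SetWriteTimeout(15 * time.Second)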
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"hash/crc32"
)
var table = crc32.MakeTable(crc32.Castagnoli)
// CRC is a CRC-32 checksum computed using Castagnoli's polynomial.
type CRC uint32
// NewCRC creates a new crc based on the given bytes.
func NewCRC(b []byte) CRC {
return CRC(0).Update(b)
}
// Update updates the crc with the given bytes.
func (c CRC) Update(b []byte) CRC {
return CRC(crc32.Update(uint32(c), table, b))
}
// Value returns a masked crc.
func (c CRC) Value() uint32 {
return uint32(c>>15|c<<17) + 0xa282ead8
}
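// Masking sketch: Value rotates the raw CRC-32C right by 15 bits and adds a
// constant (the masking scheme used by LevelDB), so that storing a CRC inside
// CRC-protected data does not yield degenerate checksums.
//
//	sum := NewCRC([]byte("hello")).Value() // masked CRC-32C of "hello"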
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"bufio"
"errors"
"io"
)
var (
maxEmptyReads = 100
err_reader_isnil = errors.New("BufferReader: reader is nil!")
err_negative_count = errors.New("BufferReader: read return negative count!")
err_no_progress = errors.New("BufferReader: multiple Read calls return no data or error!")
err_too_large = errors.New("BufferReader: make byte slice too large!")
)
type BufferReader struct {
buf []byte
reader io.Reader
size int
r, w int
err error
}
func NewBufferReader(reader io.Reader, size int) *BufferReader {
return &BufferReader{
reader: reader,
size: size,
buf: make([]byte, size),
}
}
func (br *BufferReader) Reset() {
if br.w > br.r {
copy(br.buf, br.buf[br.r:br.w])
}
br.w = br.w - br.r
br.r = 0
}
func (br *BufferReader) ReadFull(min int) (data []byte, err error) {
if br.reader == nil {
return nil, err_reader_isnil
}
if min == 0 {
err = br.err
br.err = nil
return make([]byte, 0, 0), err
}
if min > (cap(br.buf) - br.r) {
br.Grow(min)
}
for (br.w-br.r) < min && err == nil {
br.fill()
err = br.err
}
if (br.w - br.r) >= min {
data = br.buf[br.r : br.r+min]
br.r = br.r + min
err = nil
} else {
data = br.buf[br.r:br.w]
br.r = br.w
err = br.err
br.err = nil
}
return
}
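// Usage sketch (illustrative; conn is a placeholder io.Reader such as a network
// connection):
//
//	br := NewBufferReader(conn, 16*KB)
//	header, err := br.ReadFull(9) // e.g. 1-byte type + 8-byte length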
func (br *BufferReader) fill() {
if br.w >= cap(br.buf) {
br.Grow(br.w - br.r)
}
for i := maxEmptyReads; i > 0; i-- {
n, err := br.reader.Read(br.buf[br.w:])
if n < 0 {
panic(err_negative_count)
}
br.w = br.w + n
if err != nil {
br.err = err
return
}
if n > 0 {
return
}
}
br.err = err_no_progress
}
func (br *BufferReader) Grow(n int) {
defer func() {
if recover() != nil {
panic(err_too_large)
}
}()
var buf []byte
if n > br.size {
buf = make([]byte, n)
} else {
buf = make([]byte, br.size)
}
if br.w > br.r {
copy(buf, br.buf[br.r:br.w])
}
br.w = br.w - br.r
br.r = 0
br.buf = buf
}
type BufferWriter struct {
*bufio.Writer
}
func NewBufferWriter(wr io.Writer, size int) *BufferWriter {
return &BufferWriter{
Writer: bufio.NewWriterSize(wr, size),
}
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package log
import (
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
"math"
"os"
"path"
"runtime"
"sort"
"strings"
"sync"
"syscall"
"time"
)
const (
// A colon appears after these items: 2009/01/23 01:23:23.123123 /a/b/c/d.go:23: message
Ldate = 1 << iota // the date: 2009/01/23
Ltime // the time: 01:23:23
Lmicroseconds // microsecond resolution: 01:23:23.123123. assumes Ltime.
Llongfile // full file name and line number: /a/b/c/d.go:23
Lshortfile // final file name element and line number: d.go:23. overrides Llongfile
LstdFlags = Ldate | Ltime // initial values for the standard logger
LogFileNameDateFormat = "200601021504"
LogMaxReservedDays = 7 * 24 * time.Hour
// DefaultRollingSize specifies the size at which to roll the output log, in MB
DefaultRollingSize = 5 * 1024
DefaultMinRollingSize = 200
// DefaultHeadRoom is the tolerance for the log space limit, in MB
DefaultHeadRoom = 50 * 1024
// DefaultHeadRatio is the reserved disk space ratio
DefaultHeadRatio = 0.2
)
var (
errLogFileName = "_err.log"
warnLogFileName = "_warn.log"
infoLogFileName = "_info.log"
debugLogFileName = "_debug.log"
)
type logWriter struct {
mu sync.Mutex // ensures atomic writes; protects the following fields
prefix string // prefix to write at beginning of each line
flag int // properties
out io.WriteCloser // destination for output
buf []byte // for accumulating text to write
}
func newLogWriter(out io.WriteCloser, prefix string, flag int) *logWriter {
return &logWriter{out: out, prefix: prefix, flag: flag}
}
type RolledFile []os.FileInfo
func (f RolledFile) Less(i, j int) bool {
return f[i].ModTime().Before(f[j].ModTime())
}
func (f RolledFile) Len() int {
return len(f)
}
func (f RolledFile) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
func itoa(buf *[]byte, i int, wid int) {
var u uint = uint(i)
if u == 0 && wid <= 1 {
*buf = append(*buf, '0')
return
}
// Assemble decimal in reverse order.
var b [32]byte
bp := len(b)
for ; u > 0 || wid > 0; u /= 10 {
bp--
wid--
b[bp] = byte(u%10) + '0'
}
*buf = append(*buf, b[bp:]...)
}
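// Padding behavior sketch: itoa appends the decimal form of i, left-padded with
// zeros to the requested width (wid <= 0 means no padding), e.g.:
//
//	var buf []byte
//	itoa(&buf, 7, 2)    // buf == []byte("07")
//	itoa(&buf, 123, -1) // appends "123" with no padding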
func (l *logWriter) formatHeader(buf *[]byte, t time.Time, file string, line int) {
*buf = append(*buf, l.prefix...)
if l.flag&(Ldate|Ltime|Lmicroseconds) != 0 {
if l.flag&Ldate != 0 {
year, month, day := t.Date()
itoa(buf, year, 4)
*buf = append(*buf, '-')
itoa(buf, int(month), 2)
*buf = append(*buf, '-')
itoa(buf, day, 2)
*buf = append(*buf, ' ')
}
if l.flag&(Ltime|Lmicroseconds) != 0 {
hour, min, sec := t.Clock()
itoa(buf, hour, 2)
*buf = append(*buf, ':')
itoa(buf, min, 2)
*buf = append(*buf, ':')
itoa(buf, sec, 2)
if l.flag&Lmicroseconds != 0 {
*buf = append(*buf, ',')
itoa(buf, t.Nanosecond()/1e6, 3)
}
*buf = append(*buf, ' ')
}
}
if l.flag&(Lshortfile|Llongfile) != 0 {
if l.flag&Lshortfile != 0 {
short := file
for i := len(file) - 1; i > 0; i-- {
if file[i] == '/' {
short = file[i+1:]
break
}
}
file = short
}
*buf = append(*buf, file...)
*buf = append(*buf, ':')
itoa(buf, line, -1)
*buf = append(*buf, ": "...)
}
}
func (l *logWriter) output(s string, file string, line int, now time.Time) error {
l.mu.Lock()
defer l.mu.Unlock()
l.buf = l.buf[:0]
l.formatHeader(&l.buf, now, file, line)
l.buf = append(l.buf, s...)
if len(s) > 0 && s[len(s)-1] != '\n' {
l.buf = append(l.buf, '\n')
}
_, err := l.out.Write(l.buf)
return err
}
func (lw *logWriter) rotateFile(logDir, logFile, module string, rotate bool) {
lw.mu.Lock()
defer lw.mu.Unlock()
if lw.out != nil {
lw.out.Close()
}
file, err := lw.createFile(logDir, logFile, module, rotate)
if err != nil {
file = os.Stdout
}
lw.out = file
if err == nil && logFile == errLogFileName {
if f, e := file.Stat(); e == nil && f.Size() == 0 {
// Write header.
var buf bytes.Buffer
fmt.Fprintf(&buf, "Log file created at: %s\n", time.Now().Format("2006/01/02 15:04:05"))
fmt.Fprintf(&buf, "Log line format: yyyy-mm-dd hh:mm:ss.uuuuuu[DIWE] file:line: msg\n")
fmt.Fprintf(&buf, "####################################################################\n\n")
lw.out.Write(buf.Bytes())
}
}
}
func (lw *logWriter) createFile(logDir, logFile, module string, rotate bool) (*os.File, error) {
if _, err := os.Stat(logDir); err != nil && os.IsNotExist(err) {
if err = os.MkdirAll(logDir, os.ModePerm); err != nil {
fmt.Printf("[Util.Logger]Create logger dir[%s] err: [%s]\r\n", logDir, err)
}
}
logFileOpt := os.O_RDWR | os.O_CREATE | os.O_APPEND
logFilePath := logDir + "/" + module + logFile
if rotate {
yesterday := time.Now().AddDate(0, 0, -1)
os.Rename(logFilePath, logFilePath+"."+yesterday.Format(LogFileNameDateFormat))
}
file, err := os.OpenFile(logFilePath, logFileOpt, os.ModePerm)
if err != nil {
fmt.Printf("[Util.Logger]Create logger file[%s] err: [%s]\r\n", logFilePath, err)
}
return file, err
}
func (lw *logWriter) checkRollingSize(logDir, logFile, module string, rollingSizeMB int64) {
logFilePath := path.Join(logDir, module+logFile)
fInfo, err := os.Stat(logFilePath)
if err == nil {
if fInfo.Size() >= rollingSizeMB*1024*1024 {
lw.rotateFile(logDir, logFile, module, true)
}
}
}
const (
TraceLevel = 0
DebugLevel = 1
InfoLevel = 2
WarnLevel = 3
ErrorLevel = 4
FatalLevel = 5
)
var levels = []string{
"[TRACE]",
"[DEBUG]",
"[INFO.]",
"[WARN.]",
"[ERROR]",
"[FATAL]",
}
type entity struct {
msg string
now time.Time
file string
line int
}
type Log struct {
dir string
module string
level int
startTime time.Time
flag int
err *logWriter
warn *logWriter
info *logWriter
debug *logWriter
entityCh chan *entity
rollingSizeMB int64 // the size of the rotated log, unit: MB
headRoomMB int64 // capacity reserved for writing the next log on the disk, unit: MB
}
var glog *Log = NewDefaultLog()
func NewDefaultLog() *Log {
log, err := NewLog("", "", "DEBUG")
if err != nil {
panic(err)
}
return log
}
func NewLog(dir, module, level string) (*Log, error) {
lg := new(Log)
lg.dir = dir
lg.module = module
lg.SetLevel(level)
if err := lg.initLog(dir, module); err != nil {
return nil, err
}
lg.startTime = time.Now()
lg.entityCh = make(chan *entity, 204800)
if dir != "" {
if err := lg.SetRotate(dir); err != nil {
return nil, err
}
go lg.checkLogRotation(dir, module)
go lg.checkCleanLog(dir, module)
}
go lg.loopMsg()
return lg, nil
}
func InitFileLog(dir, module, level string) {
log, err := NewLog(dir, module, level)
if err != nil {
panic(err)
}
glog = log
}
func GetFileLogger() *Log {
return glog
}
func (l *Log) initLog(logDir, module string) error {
logOpt := Lshortfile | LstdFlags | Lmicroseconds
if logDir == "" {
l.debug = newLogWriter(os.Stdout, "", logOpt)
l.info = newLogWriter(os.Stdout, "", logOpt)
l.warn = newLogWriter(os.Stdout, "", logOpt)
l.err = newLogWriter(os.Stdout, "", logOpt)
return nil
}
if fi, err := os.Stat(logDir); err != nil {
return err
} else if !fi.IsDir() {
return errors.New(logDir + " is not a directory")
}
l.flag = logOpt
l.debug = newLogWriter(nil, "", logOpt)
l.info = newLogWriter(nil, "", logOpt)
l.warn = newLogWriter(nil, "", logOpt)
l.err = newLogWriter(nil, "", logOpt)
l.debug.rotateFile(logDir, debugLogFileName, module, false)
l.info.rotateFile(logDir, infoLogFileName, module, false)
l.warn.rotateFile(logDir, warnLogFileName, module, false)
l.err.rotateFile(logDir, errLogFileName, module, false)
return nil
}
func (l *Log) SetLevel(level string) {
switch level {
case "TRACE", "trace", "Trace":
l.level = TraceLevel
case "", "debug", "Debug", "DEBUG":
l.level = DebugLevel
case "info", "Info", "INFO":
l.level = InfoLevel
case "warn", "Warn", "WARN":
l.level = WarnLevel
case "error", "Error", "ERROR":
l.level = ErrorLevel
default:
l.level = InfoLevel
}
}
func (l *Log) SetPrefix(s, level string) string {
return level + " " + s
}
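// SetRotate derives the rotation limits from the filesystem holding logDir:
// headRoomMB is DefaultHeadRatio of the disk space (of the available space when it is
// scarcer than that ratio of the total, of the total space otherwise), capped at
// DefaultHeadRoom; rollingSizeMB is a quarter of the available space (one share per
// level file), clamped to [DefaultMinRollingSize, DefaultRollingSize].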
func (l *Log) SetRotate(logDir string) error {
fs := syscall.Statfs_t{}
if err := syscall.Statfs(logDir, &fs); err != nil {
return fmt.Errorf("[InitLog] stats disk space: %s", err.Error())
}
var minRatio float64
if float64(fs.Bavail*uint64(fs.Bsize)) < float64(fs.Blocks*uint64(fs.Bsize))*DefaultHeadRatio {
minRatio = float64(fs.Bavail*uint64(fs.Bsize)) * DefaultHeadRatio / 1024 / 1024
} else {
minRatio = float64(fs.Blocks*uint64(fs.Bsize)) * DefaultHeadRatio / 1024 / 1024
}
l.headRoomMB = int64(math.Min(minRatio, DefaultHeadRoom))
minRollingSize := int64(fs.Bavail*uint64(fs.Bsize)/4) / 1024 / 1024 // split the available space across the 4 level files
if minRollingSize < DefaultMinRollingSize {
minRollingSize = DefaultMinRollingSize
}
l.rollingSizeMB = int64(math.Min(float64(minRollingSize), float64(DefaultRollingSize)))
return nil
}
func (l *Log) IsEnableDebug() bool {
return l.level <= DebugLevel
}
func (l *Log) IsEnableInfo() bool {
return l.level <= InfoLevel
}
func (l *Log) IsEnableWarn() bool {
return l.level <= WarnLevel
}
func (l *Log) IsEnableError() bool {
return l.level <= ErrorLevel
}
func (l *Log) IsEnableTrace() bool {
return l.level <= TraceLevel
}
func (l *Log) Output(calldepth int, s string, sync bool) {
now := time.Now()
var file string
var line int
var ok bool
if l.flag&(Lshortfile|Llongfile) != 0 {
_, file, line, ok = runtime.Caller(calldepth)
if !ok {
file = "???"
line = 0
}
}
if sync {
l.printMsg(s, file, line, now)
} else {
l.putMsg(s, file, line, now)
}
}
func (l *Log) putMsg(msg string, file string, line int, now time.Time) {
l.entityCh <- &entity{msg: msg, file: file, line: line, now: now}
}
func (l *Log) loopMsg() {
for entity := range l.entityCh {
l.printMsg(entity.msg, entity.file, entity.line, entity.now)
}
}
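// printMsg fans a formatted message out to the per-level writers. The second byte of msg
// is the level letter from the "[DEBUG]"-style prefix added by SetPrefix: the second switch
// always routes a message to its own level's file (T/D -> debug, I -> info, W -> warn,
// E/F -> err), while the first switch additionally copies higher-severity messages into the
// lower-level files when the configured level permits, so the debug file holds a complete
// record at DEBUG or TRACE level.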
func (l *Log) printMsg(msg string, file string, line int, now time.Time) {
switch l.level {
case TraceLevel:
switch msg[1] {
case 'I', 'W', 'E', 'F':
l.debug.output(msg, file, line, now)
}
case DebugLevel:
switch msg[1] {
case 'I', 'W', 'E', 'F':
l.debug.output(msg, file, line, now)
}
case InfoLevel:
switch msg[1] {
case 'W', 'E', 'F':
l.info.output(msg, file, line, now)
}
case WarnLevel:
switch msg[1] {
case 'E', 'F':
l.warn.output(msg, file, line, now)
}
}
switch msg[1] {
case 'T':
l.debug.output(msg, file, line, now)
case 'D':
l.debug.output(msg, file, line, now)
case 'I':
l.info.output(msg, file, line, now)
case 'W':
l.warn.output(msg, file, line, now)
case 'E':
l.err.output(msg, file, line, now)
case 'F':
l.err.output(msg, file, line, now)
}
}
func (l *Log) checkLogRotation(logDir, module string) {
// handle panic
defer func() {
if r := recover(); r != nil {
fmt.Printf("[Util.Logger]Check logger rotation panic: [%s]\r\n", r)
}
}()
for {
yesterday := time.Now().AddDate(0, 0, -1)
_, err := os.Stat(logDir + "/" + module + errLogFileName + "." + yesterday.Format(LogFileNameDateFormat))
if err == nil || time.Now().Day() == l.startTime.Day() {
l.debug.checkRollingSize(logDir, debugLogFileName, module, l.rollingSizeMB)
l.info.checkRollingSize(logDir, infoLogFileName, module, l.rollingSizeMB)
l.warn.checkRollingSize(logDir, warnLogFileName, module, l.rollingSizeMB)
l.err.checkRollingSize(logDir, errLogFileName, module, l.rollingSizeMB)
time.Sleep(time.Second * 600)
continue
}
// rotate the log files
l.debug.rotateFile(logDir, debugLogFileName, module, true)
l.info.rotateFile(logDir, infoLogFileName, module, true)
l.warn.rotateFile(logDir, warnLogFileName, module, true)
l.err.rotateFile(logDir, errLogFileName, module, true)
l.startTime = time.Now()
time.Sleep(time.Second * 600)
}
}
func (l *Log) checkCleanLog(logDir, module string) {
// handle panic
defer func() {
if r := recover(); r != nil {
fmt.Printf("[Util.Logger]Check clean logger file panic: [%s]\r\n", r)
}
}()
for {
// check disk space
fs := syscall.Statfs_t{}
if err := syscall.Statfs(logDir, &fs); err != nil {
fmt.Printf("[Util.Logger]Check disk space of dir[%s] err: [%s]\r\n", logDir, err)
time.Sleep(time.Second * 600)
continue
}
diskSpaceLeft := int64(fs.Bavail * uint64(fs.Bsize))
diskSpaceLeft -= l.headRoomMB * 1024 * 1024
fInfos, err := ioutil.ReadDir(logDir)
if err != nil || len(fInfos) == 0 {
time.Sleep(time.Second * 600)
continue
}
var needDelFiles RolledFile
for _, info := range fInfos {
if deleteFileFilter(module, info, diskSpaceLeft) {
needDelFiles = append(needDelFiles, info)
}
}
sort.Sort(needDelFiles)
for _, info := range needDelFiles {
if err = os.Remove(path.Join(logDir, info.Name())); err != nil {
fmt.Printf("[Util.Logger]Remove logger file[%s] err: [%s]\r\n", info.Name(), err)
continue
}
diskSpaceLeft += info.Size()
if diskSpaceLeft > 0 && time.Since(info.ModTime()) < LogMaxReservedDays {
break
}
}
time.Sleep(time.Second * 600)
}
}
func deleteFileFilter(module string, info os.FileInfo, diskSpaceLeft int64) bool {
if diskSpaceLeft <= 0 {
return info.Mode().IsRegular() && isExpiredRaftLog(module, info.Name())
}
return time.Since(info.ModTime()) > LogMaxReservedDays && isExpiredRaftLog(module, info.Name())
}
func isExpiredRaftLog(module, name string) bool {
if strings.HasSuffix(name, ".log") {
return false
}
if strings.HasPrefix(name, module+infoLogFileName) || strings.HasPrefix(name, module+debugLogFileName) ||
strings.HasPrefix(name, module+warnLogFileName) || strings.HasPrefix(name, module+errLogFileName) {
return true
}
return false
}
func (l *Log) Debug(format string, v ...interface{}) {
if l.IsEnableDebug() {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[DebugLevel]), false)
}
}
func (l *Log) Info(format string, v ...interface{}) {
if l.IsEnableInfo() {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[InfoLevel]), false)
}
}
func (l *Log) Warn(format string, v ...interface{}) {
if l.IsEnableWarn() {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[WarnLevel]), false)
}
}
func (l *Log) Error(format string, v ...interface{}) {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[ErrorLevel]), false)
}
func (l *Log) Fatal(format string, v ...interface{}) {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[FatalLevel]), true)
os.Exit(1)
}
func (l *Log) Panic(format string, v ...interface{}) {
s := fmt.Sprintf(format+"\r\n", v...)
l.Output(3, l.SetPrefix(s, levels[FatalLevel]), true)
panic(s)
}
func Debug(format string, v ...interface{}) {
glog.Debug(format, v...)
}
func Info(format string, v ...interface{}) {
glog.Info(format, v...)
}
func Warn(format string, v ...interface{}) {
glog.Warn(format, v...)
}
func Error(format string, v ...interface{}) {
glog.Error(format, v...)
}
func Fatal(format string, v ...interface{}) {
glog.Fatal(format, v...)
}
func Panic(format string, v ...interface{}) {
glog.Panic(format, v...)
}
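// The sketch below (not part of the original sources; the directory, module name, and
// messages are illustrative) shows how a caller is expected to wire up the file logger.
// NewLog/InitFileLog require the log directory to exist, so it is created first.
func exampleInitAndLog() {
	dir := "/tmp/cubefs-log-demo"
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		panic(err)
	}
	InitFileLog(dir, "demo", "INFO")

	logger := GetFileLogger()
	logger.Info("service started, pid=%d", os.Getpid())
	Warn("disk usage above %d%%", 80) // package-level wrappers use the same global logger
	Error("open %s: %v", "/tmp/missing", os.ErrNotExist)
}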
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package log
import (
"os"
"syscall"
)
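// logCrash redirects file descriptor 2 (stderr) onto f via dup3, so that unrecovered
// panic traces the runtime writes to stderr end up in the crash log file. Linux-only.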
func logCrash(f *os.File) error {
return syscall.Dup3(int(f.Fd()), 2, 0)
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"fmt"
"runtime"
"runtime/debug"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
)
func HandleCrash(handlers ...func(interface{})) {
if r := recover(); r != nil {
debug.PrintStack()
logPanic(r)
for _, fn := range handlers {
fn(r)
}
}
}
func logPanic(r interface{}) {
callers := ""
for i := 0; true; i++ {
_, file, line, ok := runtime.Caller(i)
if !ok {
break
}
callers = callers + fmt.Sprintf("%v:%v\n", file, line)
}
logger.Error("Recovered from panic: %#v (%v)\n%v", r, r, callers)
}
func RunWorker(f func(), handlers ...func(interface{})) {
go func() {
defer HandleCrash(handlers...)
f()
}()
}
func RunWorkerUtilStop(f func(), stopCh <-chan struct{}, handlers ...func(interface{})) {
go func() {
for {
select {
case <-stopCh:
return
default:
func() {
defer HandleCrash(handlers...)
f()
}()
}
}
}()
}
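// Sketch (illustrative, not part of the original sources): running workers through the
// crash-handling helpers above. RunWorker recovers and logs a panic once; the stoppable
// variant keeps re-invoking f until stopCh is closed, so f itself should block or sleep.
func exampleRunWorkers(doWork func()) {
	stopCh := make(chan struct{})

	// One-shot worker: a panic inside f is recovered, its stack logged, and the value
	// handed to the extra handler instead of killing the process.
	RunWorker(func() {
		panic("boom")
	}, func(r interface{}) {
		fmt.Printf("worker crashed: %v\n", r)
	})

	// Looping worker: doWork is re-run until stopCh is closed.
	RunWorkerUtilStop(doWork, stopCh)

	close(stopCh) // ask the looping worker to stop after its current iteration
}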
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"time"
)
const (
_ = iota
KB = 1 << (10 * iota)
MB
GB
)
const time_format = "2006-01-02 15:04:05.000"
type Uint64Slice []uint64
func (p Uint64Slice) Len() int { return len(p) }
func (p Uint64Slice) Less(i, j int) bool { return p[i] < p[j] }
func (p Uint64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
func Min(a, b uint64) uint64 {
if a > b {
return b
}
return a
}
func Max(a, b uint64) uint64 {
if a > b {
return a
}
return b
}
func FormatDate(t time.Time) string {
return t.Format(time_format)
}
func FormatTimestamp(t int64) string {
if t <= 0 {
return ""
}
return time.Unix(0, t).Format(time_format)
}
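// Sketch (not part of the original sources): FormatTimestamp expects a Unix timestamp in
// nanoseconds, so pairing it with time.Now().UnixNano() yields the same string as
// FormatDate(time.Now()); non-positive timestamps format as "".
func exampleFormatNow() (byTime, byNanos string) {
	now := time.Now()
	return FormatDate(now), FormatTimestamp(now.UnixNano())
}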
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
func Uvarint64(buf []byte) (uint64, int) {
if buf[0] <= 0xF0 {
return uint64(buf[0]), 1
}
if buf[0] <= 0xF8 {
return 240 + 256*(uint64(buf[0])-241) + uint64(buf[1]), 2
}
if buf[0] == 0xF9 {
return 2288 + 256*uint64(buf[1]) + uint64(buf[2]), 3
}
if buf[0] == 0xFA {
return uint64(buf[1])<<16 | uint64(buf[2])<<8 | uint64(buf[3]), 4
}
if buf[0] == 0xFB {
return uint64(buf[1])<<24 | uint64(buf[2])<<16 | uint64(buf[3])<<8 | uint64(buf[4]), 5
}
if buf[0] == 0xFC {
return uint64(buf[1])<<32 | uint64(buf[2])<<24 | uint64(buf[3])<<16 | uint64(buf[4])<<8 | uint64(buf[5]), 6
}
if buf[0] == 0xFD {
return uint64(buf[1])<<40 | uint64(buf[2])<<32 | uint64(buf[3])<<24 | uint64(buf[4])<<16 | uint64(buf[5])<<8 | uint64(buf[6]), 7
}
if buf[0] == 0xFE {
return uint64(buf[1])<<48 | uint64(buf[2])<<40 | uint64(buf[3])<<32 | uint64(buf[4])<<24 | uint64(buf[5])<<16 | uint64(buf[6])<<8 | uint64(buf[7]), 8
}
return uint64(buf[1])<<56 | uint64(buf[2])<<48 | uint64(buf[3])<<40 | uint64(buf[4])<<32 | uint64(buf[5])<<24 | uint64(buf[6])<<16 | uint64(buf[7])<<8 | uint64(buf[8]), 9
}
func PutUvarint64(buf []byte, x uint64) int {
if x < 241 {
buf[0] = byte(x)
return 1
}
if x < 2288 {
buf[0] = byte((x-240)/256 + 241)
buf[1] = byte((x - 240) % 256)
return 2
}
if x < 67824 {
buf[0] = 0xF9
buf[1] = byte((x - 2288) / 256)
buf[2] = byte((x - 2288) % 256)
return 3
}
if x < 1<<24 {
buf[0] = 0xFA
buf[1] = byte(x >> 16)
buf[2] = byte(x >> 8)
buf[3] = byte(x)
return 4
}
if x < 1<<32 {
buf[0] = 0xFB
buf[1] = byte(x >> 24)
buf[2] = byte(x >> 16)
buf[3] = byte(x >> 8)
buf[4] = byte(x)
return 5
}
if x < 1<<40 {
buf[0] = 0xFC
buf[1] = byte(x >> 32)
buf[2] = byte(x >> 24)
buf[3] = byte(x >> 16)
buf[4] = byte(x >> 8)
buf[5] = byte(x)
return 6
}
if x < 1<<48 {
buf[0] = 0xFD
buf[1] = byte(x >> 40)
buf[2] = byte(x >> 32)
buf[3] = byte(x >> 24)
buf[4] = byte(x >> 16)
buf[5] = byte(x >> 8)
buf[6] = byte(x)
return 7
}
if x < 1<<56 {
buf[0] = 0xFE
buf[1] = byte(x >> 48)
buf[2] = byte(x >> 40)
buf[3] = byte(x >> 32)
buf[4] = byte(x >> 24)
buf[5] = byte(x >> 16)
buf[6] = byte(x >> 8)
buf[7] = byte(x)
return 8
}
buf[0] = 0xFF
buf[1] = byte(x >> 56)
buf[2] = byte(x >> 48)
buf[3] = byte(x >> 40)
buf[4] = byte(x >> 32)
buf[5] = byte(x >> 24)
buf[6] = byte(x >> 16)
buf[7] = byte(x >> 8)
buf[8] = byte(x)
return 9
}
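// Sketch (not part of the original sources): round-tripping a value through the
// variable-length encoding above. A 9-byte buffer always suffices, since PutUvarint64
// never writes more than 9 bytes.
func exampleVarintRoundTrip() bool {
	var buf [9]byte
	x := uint64(300000)          // encodes as 0xFA plus 3 payload bytes
	n := PutUvarint64(buf[:], x) // n == 4
	got, m := Uvarint64(buf[:n]) // decodes the same 4 bytes
	return got == x && m == n    // true: the round trip is lossless
}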
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"net"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// const
const (
// the maximum number of tasks that can be handled each time
MaxTaskNum = 30
TaskWorkerInterval = time.Second * time.Duration(2)
idleConnTimeout = 90 // seconds
connectTimeout = 10 // seconds
)
// AdminTaskManager sends administration commands to the metaNode or dataNode.
type AdminTaskManager struct {
clusterID string
targetAddr string
TaskMap map[string]*proto.AdminTask
sync.RWMutex
exitCh chan struct{}
connPool *util.ConnectPool
}
func newAdminTaskManager(targetAddr, clusterID string) (sender *AdminTaskManager) {
proto.InitBufferPool(int64(32768))
sender = &AdminTaskManager{
targetAddr: targetAddr,
clusterID: clusterID,
TaskMap: make(map[string]*proto.AdminTask),
exitCh: make(chan struct{}, 1),
connPool: util.NewConnectPoolWithTimeout(idleConnTimeout, connectTimeout),
}
go sender.process()
return
}
func (sender *AdminTaskManager) process() {
ticker := time.NewTicker(TaskWorkerInterval)
defer func() {
ticker.Stop()
Warn(sender.clusterID, fmt.Sprintf("clusterID[%v] %v sender stop", sender.clusterID, sender.targetAddr))
}()
for {
select {
case <-sender.exitCh:
return
case <-ticker.C:
sender.doDeleteTasks()
sender.doSendTasks()
}
}
}
func (sender *AdminTaskManager) doDeleteTasks() {
delTasks := sender.getToBeDeletedTasks()
for _, t := range delTasks {
sender.DelTask(t)
}
return
}
func (sender *AdminTaskManager) getToBeDeletedTasks() (delTasks []*proto.AdminTask) {
sender.RLock()
defer sender.RUnlock()
delTasks = make([]*proto.AdminTask, 0)
for _, task := range sender.TaskMap {
if task.CheckTaskTimeOut() {
log.LogWarnf(fmt.Sprintf("clusterID[%v] %v has no response until time out",
sender.clusterID, task.ID))
if task.SendTime > 0 {
Warn(sender.clusterID, fmt.Sprintf("clusterID[%v] %v has no response until time out",
sender.clusterID, task.ID))
}
// timed-out tasks will be deleted
delTasks = append(delTasks, task)
}
}
return
}
func (sender *AdminTaskManager) doSendTasks() {
tasks := sender.getToDoTasks()
if len(tasks) == 0 {
return
}
sender.sendTasks(tasks)
}
func (sender *AdminTaskManager) getConn() (conn *net.TCPConn, err error) {
if useConnPool {
return sender.connPool.GetConnect(sender.targetAddr)
}
var connect net.Conn
connect, err = net.Dial("tcp", sender.targetAddr)
if err == nil {
conn = connect.(*net.TCPConn)
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
}
return
}
func (sender *AdminTaskManager) putConn(conn *net.TCPConn, forceClose bool) {
if useConnPool {
sender.connPool.PutConnect(conn, forceClose)
}
}
func (sender *AdminTaskManager) sendTasks(tasks []*proto.AdminTask) {
for _, task := range tasks {
if task.OpCode == proto.OpVersionOperation {
log.LogInfof("action[sendTasks] get task to addr [%v]", task.OperatorAddr)
}
conn, err := sender.getConn()
if err != nil {
msg := fmt.Sprintf("clusterID[%v] get connection to %v,err,%v", sender.clusterID, sender.targetAddr, errors.Stack(err))
WarnBySpecialKey(fmt.Sprintf("%v_%v_sendTask", sender.clusterID, ModuleName), msg)
sender.putConn(conn, true)
sender.updateTaskInfo(task, false)
break
}
if err = sender.sendAdminTask(task, conn); err != nil {
log.LogError(fmt.Sprintf("send task %v to %v err %v,errStack,%v", task.ID, sender.targetAddr, err, errors.Stack(err)))
sender.putConn(conn, true)
sender.updateTaskInfo(task, true)
continue
}
sender.putConn(conn, false)
}
}
func (sender *AdminTaskManager) updateTaskInfo(task *proto.AdminTask, connSuccess bool) {
task.SendCount++
if connSuccess {
task.SendTime = time.Now().Unix()
task.Status = proto.TaskRunning
}
}
func (sender *AdminTaskManager) buildPacket(task *proto.AdminTask) (packet *proto.Packet, err error) {
packet = proto.NewPacket()
packet.Opcode = task.OpCode
packet.ReqID = proto.GenerateRequestID()
packet.PartitionID = task.PartitionID
body, err := json.Marshal(task)
if err != nil {
return nil, err
}
packet.Size = uint32(len(body))
packet.Data = body
return packet, nil
}
func (sender *AdminTaskManager) sendAdminTask(task *proto.AdminTask, conn net.Conn) (err error) {
packet, err := sender.buildPacket(task)
if err != nil {
return errors.Trace(err, "action[sendAdminTask build packet failed,task:%v]", task.ID)
}
if err = packet.WriteToConn(conn); err != nil {
return errors.Trace(err, "action[sendAdminTask],WriteToConn failed,task:%v", task.ID)
}
if err = packet.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
return errors.Trace(err, "action[sendAdminTask],ReadFromConn failed task:%v", task.ID)
}
log.LogDebugf(fmt.Sprintf("action[sendAdminTask] sender task:%v success", task.ToString()))
sender.updateTaskInfo(task, true)
return nil
}
func (sender *AdminTaskManager) syncSendAdminTask(task *proto.AdminTask) (packet *proto.Packet, err error) {
packet, err = sender.buildPacket(task)
if err != nil {
return nil, errors.Trace(err, "action[syncSendAdminTask build packet failed,task:%v]", task.ID)
}
log.LogInfof("action[syncSendAdminTask],task[%s], op %s, reqId %d", task.ToString(), packet.GetOpMsg(), packet.GetReqID())
conn, err := sender.getConn()
if err != nil {
return nil, errors.Trace(err, "action[syncSendAdminTask get conn failed,task:%v]", task.ID)
}
defer func() {
if err == nil {
sender.putConn(conn, false)
} else {
sender.putConn(conn, true)
}
}()
if err = packet.WriteToConn(conn); err != nil {
return nil, errors.Trace(err, "action[syncSendAdminTask],WriteToConn failed,task:%v,reqID[%v]", task.ID, packet.ReqID)
}
if err = packet.ReadFromConnWithVer(conn, proto.SyncSendTaskDeadlineTime); err != nil {
return nil, errors.Trace(err, "action[syncSendAdminTask],ReadFromConn failed task:%v,reqID[%v]", task.ID, packet.ReqID)
}
if packet.ResultCode != proto.OpOk {
err = fmt.Errorf("result code[%v],msg[%v]", packet.ResultCode, string(packet.Data))
log.LogErrorf("action[syncSendAdminTask],task:%v,reqID[%v],err[%v],", task.ID, packet.ReqID, err)
return
}
return packet, nil
}
// DelTask deletes the to-be-deleted tasks.
func (sender *AdminTaskManager) DelTask(t *proto.AdminTask) {
sender.Lock()
defer sender.Unlock()
_, ok := sender.TaskMap[t.ID]
if !ok {
return
}
if t.OpCode != proto.OpMetaNodeHeartbeat && t.OpCode != proto.OpDataNodeHeartbeat && t.OpCode != proto.OpLcNodeHeartbeat {
log.LogDebugf("action[DelTask] delete task[%v]", t.ToString())
}
delete(sender.TaskMap, t.ID)
}
// AddTask adds a new task to the task map.
func (sender *AdminTaskManager) AddTask(t *proto.AdminTask) {
sender.Lock()
defer sender.Unlock()
_, ok := sender.TaskMap[t.ID]
if !ok {
sender.TaskMap[t.ID] = t
}
}
func (sender *AdminTaskManager) getToDoTasks() (tasks []*proto.AdminTask) {
sender.RLock()
defer sender.RUnlock()
tasks = make([]*proto.AdminTask, 0)
// send heartbeat task first
for _, t := range sender.TaskMap {
if t.IsHeartbeatTask() && t.CheckTaskNeedSend() {
tasks = append(tasks, t)
t.SendTime = time.Now().Unix()
}
}
// send urgent task immediately
for _, t := range sender.TaskMap {
if t.IsUrgentTask() && t.CheckTaskNeedSend() {
tasks = append(tasks, t)
t.SendTime = time.Now().Unix()
}
}
for _, task := range sender.TaskMap {
if !task.IsHeartbeatTask() && !task.IsUrgentTask() && task.CheckTaskNeedSend() {
tasks = append(tasks, task)
task.SendTime = time.Now().Unix()
if task.OpCode == proto.OpVersionOperation {
log.LogInfof("action[getToDoTasks] get task to addr [%v]", task.OperatorAddr)
continue
}
}
if len(tasks) > MaxTaskNum {
break
}
}
return
}
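// Sketch (illustrative; the address, cluster name, and task fields are assumptions, not
// values used elsewhere): enqueueing a task with the sender above. AddTask only registers
// the task; the background process() loop picks it up and sends it on its next tick.
func exampleEnqueueTask() {
	sender := newAdminTaskManager("192.168.0.10:17310", "demo-cluster")
	task := &proto.AdminTask{
		ID:           "demo-task-1",
		OpCode:       proto.OpDataNodeHeartbeat,
		OperatorAddr: "192.168.0.10:17310",
	}
	sender.AddTask(task)
}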
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"math"
"net/http"
"strconv"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/log"
)
// Parse the request that adds/deletes a raft node.
func parseRequestForRaftNode(r *http.Request) (id uint64, host string, err error) {
if err = r.ParseForm(); err != nil {
return
}
var idStr string
if idStr = r.FormValue(idKey); idStr == "" {
err = keyNotFound(idKey)
return
}
if id, err = strconv.ParseUint(idStr, 10, 64); err != nil {
return
}
if host = r.FormValue(addrKey); host == "" {
err = keyNotFound(addrKey)
return
}
if arr := strings.Split(host, colonSplit); len(arr) < 2 {
err = unmatchedKey(addrKey)
return
}
return
}
func extractTxTimeout(r *http.Request) (timeout int64, err error) {
var txTimeout uint64
if txTimeout, err = extractUint64WithDefault(r, txTimeoutKey, proto.DefaultTransactionTimeout); err != nil {
return
}
if txTimeout == 0 || txTimeout > proto.MaxTransactionTimeout {
return timeout, fmt.Errorf("txTimeout(%d) value range [1-%v] minutes", txTimeout, proto.MaxTransactionTimeout)
}
timeout = int64(txTimeout)
return timeout, nil
}
func extractTxConflictRetryNum(r *http.Request) (retryNum int64, err error) {
var txRetryNum uint64
if txRetryNum, err = extractUint64WithDefault(r, txConflictRetryNumKey, proto.DefaultTxConflictRetryNum); err != nil {
return
}
if txRetryNum == 0 || txRetryNum > proto.MaxTxConflictRetryNum {
return retryNum, fmt.Errorf("txRetryNum(%d) value range [1-%v]", txRetryNum, proto.MaxTxConflictRetryNum)
}
retryNum = int64(txRetryNum)
return retryNum, nil
}
func extractTxConflictRetryInterval(r *http.Request) (interval int64, err error) {
var txInterval uint64
if txInterval, err = extractUint64WithDefault(r, txConflictRetryIntervalKey, proto.DefaultTxConflictRetryInterval); err != nil {
return
}
if txInterval < proto.MinTxConflictRetryInterval || txInterval > proto.MaxTxConflictRetryInterval {
return interval, fmt.Errorf("txInterval(%d) value range [%v-%v] ms",
txInterval, proto.MinTxConflictRetryInterval, proto.MaxTxConflictRetryInterval)
}
interval = int64(txInterval)
return interval, nil
}
func extractTxOpLimitInterval(r *http.Request, volLimit int) (limit int, err error) {
var txLimit int
if txLimit, err = extractUintWithDefault(r, txOpLimitKey, volLimit); err != nil {
return
}
limit = txLimit
return
}
func hasTxParams(r *http.Request) bool {
var (
maskStr string
timeoutStr string
)
if maskStr = r.FormValue(enableTxMaskKey); maskStr != "" {
return true
}
if timeoutStr = r.FormValue(txTimeoutKey); timeoutStr != "" {
return true
}
return false
}
func parseTxMask(r *http.Request, oldMask proto.TxOpMask) (mask proto.TxOpMask, err error) {
var maskStr string
if maskStr = r.FormValue(enableTxMaskKey); maskStr == "" {
mask = oldMask
return
}
var reset bool
reset, err = extractBoolWithDefault(r, txForceResetKey, false)
if err != nil {
return
}
mask, err = proto.GetMaskFromString(maskStr)
if err != nil {
return
}
if reset {
return
}
if mask != proto.TxOpMaskOff {
mask = mask | oldMask
}
return
}
func parseRequestForUpdateMetaNode(r *http.Request) (nodeAddr string, id uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if nodeAddr, err = extractNodeAddr(r); err != nil {
return
}
if id, err = extractNodeID(r); err != nil {
return
}
return
}
func parseRequestForAddNode(r *http.Request) (nodeAddr, zoneName string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if nodeAddr, err = extractNodeAddr(r); err != nil {
return
}
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
return
}
func parseDecomNodeReq(r *http.Request) (nodeAddr string, limit int, err error) {
nodeAddr, err = parseAndExtractNodeAddr(r)
if err != nil {
return
}
limit, err = parseUintParam(r, countKey)
if err != nil {
return
}
return
}
func parseDecomDataNodeReq(r *http.Request) (nodeAddr string, err error) {
nodeAddr, err = parseAndExtractNodeAddr(r)
if err != nil {
return
}
return
}
func parseAndExtractNodeAddr(r *http.Request) (nodeAddr string, err error) {
if err = r.ParseForm(); err != nil {
return
}
return extractNodeAddr(r)
}
func parseRequestToDecommissionNode(r *http.Request) (nodeAddr, diskPath string, err error) {
if err = r.ParseForm(); err != nil {
return
}
nodeAddr, err = extractNodeAddr(r)
if err != nil {
return
}
diskPath, err = extractDiskPath(r)
return
}
func parseRequestToGetTaskResponse(r *http.Request) (tr *proto.AdminTask, err error) {
var body []byte
if err = r.ParseForm(); err != nil {
return
}
if body, err = io.ReadAll(r.Body); err != nil {
return
}
tr = &proto.AdminTask{}
decoder := json.NewDecoder(bytes.NewBuffer(body))
decoder.UseNumber()
err = decoder.Decode(tr)
return
}
func parseVolName(r *http.Request) (name string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
return
}
func parseVolVerStrategy(r *http.Request) (strategy proto.VolumeVerStrategy, isForce bool, err error) {
var value string
if value = r.FormValue(enableKey); value == "" {
strategy.Enable = true
} else {
if strategy.Enable, err = strconv.ParseBool(value); err != nil {
log.LogErrorf("parseVolVerStrategy. strategy.Enable %v strategy %v", strategy.Enable, strategy)
return
}
}
strategy.KeepVerCnt, err = parseUintParam(r, countKey)
if strategy.Enable && err != nil {
log.LogErrorf("parseVolVerStrategy. strategy.Enable %v strategy %v", strategy.Enable, strategy)
return
}
strategy.Periodic, err = parseUintParam(r, Periodic)
if strategy.Enable && err != nil {
log.LogErrorf("parseVolVerStrategy. strategy.Enable %v strategy %v", strategy.Enable, strategy)
return
}
if value = r.FormValue(forceKey); value != "" {
isForce = true
strategy.ForceUpdate, _ = strconv.ParseBool(value)
}
log.LogDebugf("parseVolVerStrategy. strategy %v", strategy)
return
}
func parseGetVolParameter(r *http.Request) (p *getVolParameter, err error) {
p = &getVolParameter{}
skipOwnerValidationVal := r.Header.Get(proto.SkipOwnerValidation)
if len(skipOwnerValidationVal) > 0 {
if p.skipOwnerValidation, err = strconv.ParseBool(skipOwnerValidationVal); err != nil {
return
}
}
if p.name = r.FormValue(nameKey); p.name == "" {
err = keyNotFound(nameKey)
return
}
if !volNameRegexp.MatchString(p.name) {
err = errors.New("name can only be number and letters")
return
}
if p.authKey = r.FormValue(volAuthKey); !p.skipOwnerValidation && len(p.authKey) == 0 {
err = keyNotFound(volAuthKey)
return
}
return
}
func parseRequestToDeleteVol(r *http.Request) (name, authKey string, force bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if authKey, err = extractAuthKey(r); err != nil {
return
}
force, err = extractBoolWithDefault(r, forceDelVolKey, false)
if err != nil {
return
}
return
}
func extractUintWithDefault(r *http.Request, key string, def int) (val int, err error) {
var str string
if str = r.FormValue(key); str == "" {
return def, nil
}
if val, err = strconv.Atoi(str); err != nil || val < 0 {
return 0, fmt.Errorf("parse [%s] is not valid int [%d], err %v", key, val, err)
}
return val, nil
}
func extractUint64WithDefault(r *http.Request, key string, def uint64) (val uint64, err error) {
var str string
if str = r.FormValue(key); str == "" {
return def, nil
}
if val, err = strconv.ParseUint(str, 10, 64); err != nil {
return 0, fmt.Errorf("parse [%s] is not valid uint [%d], err %v", key, val, err)
}
return val, nil
}
func extractInt64WithDefault(r *http.Request, key string, def int64) (val int64, err error) {
var str string
if str = r.FormValue(key); str == "" {
return def, nil
}
if val, err = strconv.ParseInt(str, 10, 64); err != nil || val < 0 {
return 0, fmt.Errorf("parse [%s] is not valid int [%d], err %v", key, val, err)
}
return val, nil
}
func extractStrWithDefault(r *http.Request, key string, def string) (val string) {
if val = r.FormValue(key); val == "" {
return def
}
return val
}
func extractBoolWithDefault(r *http.Request, key string, def bool) (val bool, err error) {
var str string
if str = r.FormValue(key); str == "" {
return def, nil
}
if val, err = strconv.ParseBool(str); err != nil {
return false, fmt.Errorf("parse [%s] is not a bool val [%t]", key, val)
}
return val, nil
}
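// Sketch (illustrative; the "capacity" and "force" parameter names are hypothetical, real
// handlers use the key constants defined in this package): combining the extract*WithDefault
// helpers above so that missing form values fall back to defaults instead of failing.
func exampleParseTunables(r *http.Request) (capacity uint64, force bool, err error) {
	if err = r.ParseForm(); err != nil {
		return
	}
	if capacity, err = extractUint64WithDefault(r, "capacity", 100); err != nil {
		return
	}
	force, err = extractBoolWithDefault(r, "force", false)
	return
}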
type updateVolReq struct {
name string
authKey string
capacity uint64
deleteLockTime int64
followerRead bool
authenticate bool
enablePosixAcl bool
enableTransaction proto.TxOpMask
txTimeout int64
txConflictRetryNum int64
txConflictRetryInterval int64
txOpLimit int
zoneName string
description string
dpSelectorName string
dpSelectorParm string
replicaNum int
coldArgs *coldVolArgs
dpReadOnlyWhenVolFull bool
enableQuota bool
}
func parseColdVolUpdateArgs(r *http.Request, vol *Vol) (args *coldVolArgs, err error) {
args = &coldVolArgs{}
if args.objBlockSize, err = extractUintWithDefault(r, ebsBlkSizeKey, vol.EbsBlkSize); err != nil {
return
}
if args.cacheCap, err = extractUint64WithDefault(r, cacheCapacity, vol.CacheCapacity); err != nil {
return
}
if args.cacheAction, err = extractUintWithDefault(r, cacheActionKey, vol.CacheAction); err != nil {
return
}
if args.cacheThreshold, err = extractUintWithDefault(r, cacheThresholdKey, vol.CacheThreshold); err != nil {
return
}
if args.cacheTtl, err = extractUintWithDefault(r, cacheTTLKey, vol.CacheTTL); err != nil {
return
}
if args.cacheHighWater, err = extractUintWithDefault(r, cacheHighWaterKey, vol.CacheHighWater); err != nil {
return
}
if args.cacheLowWater, err = extractUintWithDefault(r, cacheLowWaterKey, vol.CacheLowWater); err != nil {
return
}
if args.cacheLRUInterval, err = extractUintWithDefault(r, cacheLRUIntervalKey, vol.CacheLRUInterval); err != nil {
return
}
if args.cacheLRUInterval < 2 {
return nil, fmt.Errorf("cacheLruInterval(%d) muster be bigger than 2 minute", args.cacheLRUInterval)
}
args.cacheRule = extractStrWithDefault(r, cacheRuleKey, vol.CacheRule)
emptyCacheRule, err := extractBoolWithDefault(r, emptyCacheRuleKey, false)
if err != nil {
return
}
if emptyCacheRule {
args.cacheRule = ""
}
// do some check
if args.cacheLowWater >= args.cacheHighWater {
return nil, fmt.Errorf("low water(%d) must be less than high water(%d)", args.cacheLowWater, args.cacheHighWater)
}
if args.cacheHighWater >= 90 || args.cacheLowWater >= 90 {
return nil, fmt.Errorf("low(%d) or high water(%d) can't be large than 90, low than 0", args.cacheLowWater, args.cacheHighWater)
}
if args.cacheAction < proto.NoCache || args.cacheAction > proto.RWCache {
return nil, fmt.Errorf("cache action is illegal (%d)", args.cacheAction)
}
return
}
func parseVolUpdateReq(r *http.Request, vol *Vol, req *updateVolReq) (err error) {
if err = r.ParseForm(); err != nil {
return
}
req.authKey = extractStr(r, volAuthKey)
req.description = extractStrWithDefault(r, descriptionKey, vol.description)
req.zoneName = extractStrWithDefault(r, zoneNameKey, vol.zoneName)
if req.capacity, err = extractUint64WithDefault(r, volCapacityKey, vol.Capacity); err != nil {
return
}
if req.deleteLockTime, err = extractInt64WithDefault(r, volDeleteLockTimeKey, vol.DeleteLockTime); err != nil {
return
}
if req.enablePosixAcl, err = extractBoolWithDefault(r, enablePosixAclKey, vol.enablePosixAcl); err != nil {
return
}
var txMask proto.TxOpMask
if txMask, err = parseTxMask(r, vol.enableTransaction); err != nil {
return
}
req.enableTransaction = txMask
if req.enableQuota, err = extractBoolWithDefault(r, enableQuota, vol.enableQuota); err != nil {
return
}
var txTimeout int64
if txTimeout, err = extractTxTimeout(r); err != nil {
return
}
req.txTimeout = txTimeout
var txConflictRetryNum int64
if txConflictRetryNum, err = extractTxConflictRetryNum(r); err != nil {
return
}
req.txConflictRetryNum = txConflictRetryNum
var txConflictRetryInterval int64
if txConflictRetryInterval, err = extractTxConflictRetryInterval(r); err != nil {
return
}
req.txConflictRetryInterval = txConflictRetryInterval
if req.txOpLimit, err = extractTxOpLimitInterval(r, vol.txOpLimit); err != nil {
return
}
if req.authenticate, err = extractBoolWithDefault(r, authenticateKey, vol.authenticate); err != nil {
return
}
if req.followerRead, err = extractBoolWithDefault(r, followerReadKey, vol.FollowerRead); err != nil {
return
}
if req.dpReadOnlyWhenVolFull, err = extractBoolWithDefault(r, dpReadOnlyWhenVolFull, vol.DpReadOnlyWhenVolFull); err != nil {
return
}
req.dpSelectorName = r.FormValue(dpSelectorNameKey)
req.dpSelectorParm = r.FormValue(dpSelectorParmKey)
if (req.dpSelectorName == "" && req.dpSelectorParm != "") || (req.dpSelectorName != "" && req.dpSelectorParm == "") {
err = keyNotFound(dpSelectorNameKey + " or " + dpSelectorParmKey)
return
} else if req.dpSelectorParm == "" && req.dpSelectorName == "" {
req.dpSelectorName = vol.dpSelectorName
req.dpSelectorParm = vol.dpSelectorParm
}
if proto.IsCold(vol.VolType) {
req.followerRead = true
req.coldArgs, err = parseColdVolUpdateArgs(r, vol)
if err != nil {
return
}
}
return
}
func parseBoolFieldToUpdateVol(r *http.Request, vol *Vol) (followerRead, authenticate bool, err error) {
if followerReadStr := r.FormValue(followerReadKey); followerReadStr != "" {
if followerRead, err = strconv.ParseBool(followerReadStr); err != nil {
err = unmatchedKey(followerReadKey)
return
}
} else {
followerRead = vol.FollowerRead
}
if authenticateStr := r.FormValue(authenticateKey); authenticateStr != "" {
if authenticate, err = strconv.ParseBool(authenticateStr); err != nil {
err = unmatchedKey(authenticateKey)
return
}
} else {
authenticate = vol.authenticate
}
return
}
func parseRequestToSetApiQpsLimit(r *http.Request) (name string, limit uint32, timeout uint32, err error) {
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if limit, err = extractUint32(r, Limit); err != nil {
return
}
if timeout, err = extractUint32(r, TimeOut); err != nil {
return
}
if timeout == 0 {
err = fmt.Errorf("timeout(seconds) args must be larger than 0")
}
return
}
func parseRequestToSetVolCapacity(r *http.Request) (name, authKey string, capacity int, err error) {
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if authKey, err = extractAuthKey(r); err != nil {
return
}
if capacity, err = extractUint(r, volCapacityKey); err != nil {
return
}
return
}
type qosArgs struct {
qosEnable bool
diskQosEnable bool
iopsRVal uint64
iopsWVal uint64
flowRVal uint64
flowWVal uint64
}
func (qos *qosArgs) isArgsWork() bool {
return (qos.iopsRVal | qos.iopsWVal | qos.flowRVal | qos.flowWVal) > 0
}
type coldVolArgs struct {
objBlockSize int
cacheCap uint64
cacheAction int
cacheThreshold int
cacheTtl int
cacheHighWater int
cacheLowWater int
cacheLRUInterval int
cacheRule string
}
type createVolReq struct {
name string
owner string
dpSize int
mpCount int
dpCount int
dpReplicaNum uint8
capacity int
deleteLockTime int64
followerRead bool
authenticate bool
crossZone bool
normalZonesFirst bool
domainId uint64
zoneName string
description string
volType int
enablePosixAcl bool
DpReadOnlyWhenVolFull bool
enableTransaction proto.TxOpMask
enableQuota bool
txTimeout int64
txConflictRetryNum int64
txConflictRetryInterval int64
qosLimitArgs *qosArgs
clientReqPeriod, clientHitTriggerCnt uint32
// cold vol args
coldArgs coldVolArgs
}
func checkCacheAction(action int) error {
if action != proto.NoCache && action != proto.RCache && action != proto.RWCache {
return fmt.Errorf("cache action is not legal, action [%d]", action)
}
return nil
}
func parseColdArgs(r *http.Request) (args coldVolArgs, err error) {
args.cacheRule = extractStr(r, cacheRuleKey)
if args.objBlockSize, err = extractUint(r, ebsBlkSizeKey); err != nil {
return
}
if args.cacheCap, err = extractUint64(r, cacheCapacity); err != nil {
return
}
if args.cacheAction, err = extractUint(r, cacheActionKey); err != nil {
return
}
if args.cacheThreshold, err = extractUint(r, cacheThresholdKey); err != nil {
return
}
if args.cacheTtl, err = extractUint(r, cacheTTLKey); err != nil {
return
}
if args.cacheHighWater, err = extractUint(r, cacheHighWaterKey); err != nil {
return
}
if args.cacheLowWater, err = extractUint(r, cacheLowWaterKey); err != nil {
return
}
if args.cacheLRUInterval, err = extractUint(r, cacheLRUIntervalKey); err != nil {
return
}
return
}
func parseRequestToCreateVol(r *http.Request, req *createVolReq) (err error) {
if err = r.ParseForm(); err != nil {
return
}
if req.name, err = extractName(r); err != nil {
return
}
if req.owner, err = extractOwner(r); err != nil {
return
}
if req.coldArgs, err = parseColdArgs(r); err != nil {
return
}
if req.mpCount, err = extractUintWithDefault(r, metaPartitionCountKey, defaultInitMetaPartitionCount); err != nil {
return
}
if req.dpCount, err = extractUintWithDefault(r, dataPartitionCountKey, defaultInitDataPartitionCnt); err != nil {
return
}
var parsedDpReplicaNum int
if parsedDpReplicaNum, err = extractUint(r, replicaNumKey); err != nil {
return
}
if parsedDpReplicaNum < 0 || parsedDpReplicaNum > math.MaxUint8 {
return fmt.Errorf("invalid arg dpReplicaNum: %v", parsedDpReplicaNum)
}
req.dpReplicaNum = uint8(parsedDpReplicaNum)
if req.dpSize, err = extractUintWithDefault(r, dataPartitionSizeKey, 120); err != nil {
return
}
// default capacity 120
if req.capacity, err = extractUint(r, volCapacityKey); err != nil {
return
}
if req.deleteLockTime, err = extractInt64WithDefault(r, volDeleteLockTimeKey, 0); err != nil {
return
}
if req.volType, err = extractUint(r, volTypeKey); err != nil {
return
}
followerRead, followerExist, err := extractFollowerRead(r)
if err != nil {
return
}
if followerExist && !followerRead && proto.IsHot(req.volType) &&
(req.dpReplicaNum == 1 || req.dpReplicaNum == 2) {
return fmt.Errorf("vol with 1 or 2 replicas should enable followerRead")
}
req.followerRead = followerRead
if proto.IsHot(req.volType) && (req.dpReplicaNum == 1 || req.dpReplicaNum == 2) {
req.followerRead = true
}
if req.authenticate, err = extractBoolWithDefault(r, authenticateKey, false); err != nil {
return
}
if req.crossZone, err = extractBoolWithDefault(r, crossZoneKey, false); err != nil {
return
}
if req.normalZonesFirst, err = extractBoolWithDefault(r, normalZonesFirstKey, false); err != nil {
return
}
if req.qosLimitArgs, err = parseRequestQos(r, false, false); err != nil {
return err
}
req.zoneName = extractStr(r, zoneNameKey)
req.description = extractStr(r, descriptionKey)
req.domainId, err = extractUint64WithDefault(r, domainIdKey, 0)
if err != nil {
return
}
req.enablePosixAcl, err = extractPosixAcl(r)
if req.DpReadOnlyWhenVolFull, err = extractBoolWithDefault(r, dpReadOnlyWhenVolFull, false); err != nil {
return
}
var txMask proto.TxOpMask
if txMask, err = parseTxMask(r, proto.TxOpMaskOff); err != nil {
return
}
req.enableTransaction = txMask
var txTimeout int64
if txTimeout, err = extractTxTimeout(r); err != nil {
return
}
req.txTimeout = txTimeout
var txConflictRetryNum int64
if txConflictRetryNum, err = extractTxConflictRetryNum(r); err != nil {
return
}
req.txConflictRetryNum = txConflictRetryNum
var txConflictRetryInterval int64
if txConflictRetryInterval, err = extractTxConflictRetryInterval(r); err != nil {
return
}
req.txConflictRetryInterval = txConflictRetryInterval
if req.enableQuota, err = extractBoolWithDefault(r, enableQuota, false); err != nil {
return
}
return
}
func parseRequestToCreateDataPartition(r *http.Request) (count int, name string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if countStr := r.FormValue(countKey); countStr == "" {
err = keyNotFound(countKey)
return
} else if count, err = strconv.Atoi(countStr); err != nil || count == 0 {
err = unmatchedKey(countKey)
return
}
if name, err = extractName(r); err != nil {
return
}
return
}
func parseRequestToGetConcurrentLcNode(r *http.Request) (count uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if count, err = extractUint64(r, countKey); err != nil || count == 0 {
err = unmatchedKey(countKey)
return
}
return
}
func parseRequestToGetDataPartition(r *http.Request) (ID uint64, volName string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ID, err = extractDataPartitionID(r); err != nil {
return
}
volName = r.FormValue(nameKey)
return
}
func parseRequestToBalanceMetaPartition(r *http.Request) (zones string, nodeSetIds string, err error) {
if err = r.ParseForm(); err != nil {
return
}
zones = r.FormValue(zoneNameKey)
nodeSetIds = r.FormValue(nodesetIdKey)
return
}
func parseRequestToLoadDataPartition(r *http.Request) (ID uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ID, err = extractDataPartitionID(r); err != nil {
return
}
return
}
func parseRequestToAddMetaReplica(r *http.Request) (ID uint64, addr string, err error) {
return extractMetaPartitionIDAndAddr(r)
}
func parseRequestToRemoveMetaReplica(r *http.Request) (ID uint64, addr string, err error) {
return extractMetaPartitionIDAndAddr(r)
}
func extractMetaPartitionIDAndAddr(r *http.Request) (ID uint64, addr string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ID, err = extractMetaPartitionID(r); err != nil {
return
}
if addr, err = extractNodeAddr(r); err != nil {
return
}
return
}
func parseRequestToAddDataReplica(r *http.Request) (ID uint64, addr string, err error) {
return extractDataPartitionIDAndAddr(r)
}
func parseRequestToRemoveDataReplica(r *http.Request) (ID uint64, addr string, err error) {
return extractDataPartitionIDAndAddr(r)
}
func extractDataPartitionIDAndAddr(r *http.Request) (ID uint64, addr string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ID, err = extractDataPartitionID(r); err != nil {
return
}
if addr, err = extractNodeAddr(r); err != nil {
return
}
return
}
func extractDataPartitionID(r *http.Request) (ID uint64, err error) {
var value string
if value = r.FormValue(idKey); value == "" {
err = keyNotFound(idKey)
return
}
return strconv.ParseUint(value, 10, 64)
}
func parseRequestToDecommissionDataPartition(r *http.Request) (ID uint64, nodeAddr string, err error) {
return extractDataPartitionIDAndAddr(r)
}
func extractNodeAddr(r *http.Request) (nodeAddr string, err error) {
if nodeAddr = r.FormValue(addrKey); nodeAddr == "" {
err = keyNotFound(addrKey)
return
}
if ipAddr, ok := util.ParseAddrToIpAddr(nodeAddr); ok {
nodeAddr = ipAddr
}
return
}
func extractNodeID(r *http.Request) (ID uint64, err error) {
var value string
if value = r.FormValue(idKey); value == "" {
err = keyNotFound(idKey)
return
}
return strconv.ParseUint(value, 10, 64)
}
func extractNodesetID(r *http.Request) (ID uint64, err error) {
// nodeset id use same form key with node id
return extractNodeID(r)
}
func extractDiskPath(r *http.Request) (diskPath string, err error) {
if diskPath = r.FormValue(diskPathKey); diskPath == "" {
err = keyNotFound(diskPathKey)
return
}
return
}
func extractDiskDisable(r *http.Request) (diskDisable bool, err error) {
var value string
if value = r.FormValue(DiskDisableKey); value == "" {
diskDisable = true
return
}
return strconv.ParseBool(value)
}
func parseRequestToLoadMetaPartition(r *http.Request) (partitionID uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if partitionID, err = extractMetaPartitionID(r); err != nil {
return
}
return
}
func parseRequestToDecommissionMetaPartition(r *http.Request) (partitionID uint64, nodeAddr string, err error) {
return extractMetaPartitionIDAndAddr(r)
}
func parseAndExtractStatus(r *http.Request) (status bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
return extractStatus(r)
}
func parseAndExtractForbidden(r *http.Request) (forbidden bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
return extractForbidden(r)
}
func extractStatus(r *http.Request) (status bool, err error) {
var value string
if value = r.FormValue(enableKey); value == "" {
err = keyNotFound(enableKey)
return
}
if status, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func extractForbidden(r *http.Request) (forbidden bool, err error) {
var value string
if value = r.FormValue(forbiddenKey); value == "" {
err = keyNotFound(forbiddenKey)
return
}
if forbidden, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func extractDataNodesetSelector(r *http.Request) string {
return r.FormValue(dataNodesetSelectorKey)
}
func extractMetaNodesetSelector(r *http.Request) string {
return r.FormValue(metaNodesetSelectorKey)
}
func extractDataNodeSelector(r *http.Request) string {
return r.FormValue(dataNodeSelectorKey)
}
func extractMetaNodeSelector(r *http.Request) string {
return r.FormValue(metaNodeSelectorKey)
}
func extractFollowerRead(r *http.Request) (followerRead bool, exist bool, err error) {
var value string
if value = r.FormValue(followerReadKey); value == "" {
followerRead = false
return
}
exist = true
if followerRead, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func extractAuthenticate(r *http.Request) (authenticate bool, err error) {
var value string
if value = r.FormValue(authenticateKey); value == "" {
authenticate = false
return
}
if authenticate, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func extractCrossZone(r *http.Request) (crossZone bool, err error) {
var value string
if value = r.FormValue(crossZoneKey); value == "" {
crossZone = false
return
}
if crossZone, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func parseAndExtractDirLimit(r *http.Request) (limit uint32, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
value = r.FormValue(dirLimitKey)
if value == "" {
value = r.FormValue(dirQuotaKey)
if value == "" {
err = keyNotFound(dirLimitKey)
return
}
}
var tmpLimit uint64
if tmpLimit, err = strconv.ParseUint(value, 10, 32); err != nil {
return
}
limit = uint32(tmpLimit)
return
}
func parseAndExtractThreshold(r *http.Request) (threshold float64, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
if value = r.FormValue(thresholdKey); value == "" {
err = keyNotFound(thresholdKey)
return
}
if threshold, err = strconv.ParseFloat(value, 64); err != nil {
return
}
return
}
func parseAndExtractSetNodeSetInfoParams(r *http.Request) (params map[string]interface{}, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
params = make(map[string]interface{})
if value = r.FormValue(countKey); value != "" {
count := uint64(0)
count, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(countKey)
return
}
params[countKey] = count
} else {
return nil, fmt.Errorf("not found %v", countKey)
}
var zoneName string
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
params[zoneNameKey] = zoneName
if value = r.FormValue(idKey); value != "" {
nodesetId := uint64(0)
nodesetId, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(idKey)
return
}
params[idKey] = nodesetId
} else {
return nil, fmt.Errorf("not found %v", idKey)
}
log.LogInfof("action[parseAndExtractSetNodeSetInfoParams]%v,%v,%v", params[zoneNameKey], params[idKey], params[countKey])
return
}
func parseAndExtractSetNodeInfoParams(r *http.Request) (params map[string]interface{}, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
noParams := true
params = make(map[string]interface{})
if value = r.FormValue(nodeDeleteBatchCountKey); value != "" {
noParams = false
batchCount := uint64(0)
batchCount, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeDeleteBatchCountKey)
return
}
params[nodeDeleteBatchCountKey] = batchCount
}
if value = r.FormValue(nodeMarkDeleteRateKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeMarkDeleteRateKey)
return
}
params[nodeMarkDeleteRateKey] = val
}
if value = r.FormValue(nodeAutoRepairRateKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeAutoRepairRateKey)
return
}
params[nodeAutoRepairRateKey] = val
}
if value = r.FormValue(nodeDeleteWorkerSleepMs); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeDeleteWorkerSleepMs)
return
}
params[nodeDeleteWorkerSleepMs] = val
}
if value = r.FormValue(clusterLoadFactorKey); value != "" {
noParams = false
valF, err := strconv.ParseFloat(value, 64)
if err != nil || valF < 0 {
err = unmatchedKey(clusterLoadFactorKey)
return params, err
}
params[clusterLoadFactorKey] = float32(valF)
}
if value = r.FormValue(maxDpCntLimitKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(maxDpCntLimitKey)
return
}
params[maxDpCntLimitKey] = val
}
if value = r.FormValue(nodeDpRepairTimeOutKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeDpRepairTimeOutKey)
return
}
params[nodeDpRepairTimeOutKey] = val
}
if value = r.FormValue(nodeDpMaxRepairErrCntKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeDpMaxRepairErrCntKey)
return
}
params[nodeDpMaxRepairErrCntKey] = val
}
if value = r.FormValue(clusterCreateTimeKey); value != "" {
noParams = false
params[clusterCreateTimeKey] = value
}
if value = extractDataNodesetSelector(r); value != "" {
noParams = false
params[dataNodesetSelectorKey] = value
}
if value = extractMetaNodesetSelector(r); value != "" {
noParams = false
params[metaNodesetSelectorKey] = value
}
if value = extractDataNodeSelector(r); value != "" {
noParams = false
params[dataNodeSelectorKey] = value
}
if value = extractMetaNodeSelector(r); value != "" {
noParams = false
params[metaNodeSelectorKey] = value
}
if noParams {
err = keyNotFound(nodeDeleteBatchCountKey)
return
}
return
}
func validateRequestToCreateMetaPartition(r *http.Request) (volName string, count int, err error) {
if err = r.ParseForm(); err != nil {
return
}
if countStr := r.FormValue(countKey); countStr == "" {
err = keyNotFound(countKey)
return
} else if count, err = strconv.Atoi(countStr); err != nil || count == 0 {
err = unmatchedKey(countKey)
return
}
if volName, err = extractName(r); err != nil {
return
}
return
}
func parseAndExtractPartitionInfo(r *http.Request) (partitionID uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if partitionID, err = extractMetaPartitionID(r); err != nil {
return
}
return
}
func extractMetaPartitionID(r *http.Request) (partitionID uint64, err error) {
var value string
if value = r.FormValue(idKey); value == "" {
err = keyNotFound(idKey)
return
}
return strconv.ParseUint(value, 10, 64)
}
func extractAuthKey(r *http.Request) (authKey string, err error) {
if authKey = r.FormValue(volAuthKey); authKey == "" {
err = keyNotFound(volAuthKey)
return
}
return
}
func extractClientIDKey(r *http.Request) (clientIDKey string, err error) {
if clientIDKey = r.FormValue(ClientIDKey); clientIDKey == "" {
err = keyNotFound(ClientIDKey)
return
}
return
}
func parseVolStatReq(r *http.Request) (name string, ver int, byMeta bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
name, err = extractName(r)
if err != nil {
return
}
ver, err = extractUint(r, clientVersion)
if err != nil {
return
}
byMeta, err = extractBoolWithDefault(r, CountByMeta, false)
if err != nil {
return
}
return
}
func parseQosInfo(r *http.Request) (info *proto.ClientReportLimitInfo, err error) {
info = proto.NewClientReportLimitInfo()
var body []byte
if body, err = io.ReadAll(r.Body); err != nil {
return
}
// log.LogInfof("action[parseQosInfo] body len:[%v],crc:[%v]", len(body), crc32.ChecksumIEEE(body))
err = json.Unmarshal(body, info)
return
}
func parseAndExtractName(r *http.Request) (name string, err error) {
if err = r.ParseForm(); err != nil {
return
}
return extractName(r)
}
func extractName(r *http.Request) (name string, err error) {
if name = r.FormValue(nameKey); name == "" {
err = keyNotFound(nameKey)
return
}
if !volNameRegexp.MatchString(name) {
return "", errors.New("name can only be number and letters")
}
return
}
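// For illustration: a plain alphanumeric name such as "vol01" passes the
// volNameRegexp check in extractName above, while a value containing other
// characters (for example "vol/01") is rejected with "name can only contain
// numbers and letters". The exact pattern is defined elsewhere in this package.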
func extractUint(r *http.Request, key string) (val int, err error) {
var str string
var valParsed int64
if str = r.FormValue(key); str == "" {
return 0, nil
}
if valParsed, err = strconv.ParseInt(str, 10, 32); err != nil || valParsed < 0 {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
val = int(valParsed)
return val, nil
}
func extractPositiveUint(r *http.Request, key string) (val int, err error) {
var str string
if str = r.FormValue(key); str == "" {
return 0, fmt.Errorf("args [%s] is not legal", key)
}
if val, err = strconv.Atoi(str); err != nil || val <= 0 {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
return val, nil
}
func extractUint64(r *http.Request, key string) (val uint64, err error) {
var str string
if str = r.FormValue(key); str == "" {
return 0, nil
}
if val, err = strconv.ParseUint(str, 10, 64); err != nil {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
return val, nil
}
func extractUint32(r *http.Request, key string) (val uint32, err error) {
var str string
if str = r.FormValue(key); str == "" {
return 0, nil
}
var tmp uint64
if tmp, err = strconv.ParseUint(str, 10, 32); err != nil {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
return uint32(tmp), nil
}
func extractPositiveUint64(r *http.Request, key string) (val uint64, err error) {
var str string
if str = r.FormValue(key); str == "" {
return 0, fmt.Errorf("args [%s] is not legal", key)
}
if val, err = strconv.ParseUint(str, 10, 64); err != nil || val <= 0 {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
return val, nil
}
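// Summary of the extract* helpers above: extractUint, extractUint32 and
// extractUint64 treat a missing form value as zero and return no error, while
// extractPositiveUint and extractPositiveUint64 treat a missing or
// non-positive value as an error. A sketch (the key "count" is illustrative):
//
//	n, _ := extractUint(r, "count")            // "" -> 0, no error
//	n, err := extractPositiveUint(r, "count")  // "" -> error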
func extractStr(r *http.Request, key string) (val string) {
return r.FormValue(key)
}
func extractOwner(r *http.Request) (owner string, err error) {
if owner = r.FormValue(volOwnerKey); owner == "" {
err = keyNotFound(volOwnerKey)
return
}
if !ownerRegexp.MatchString(owner) {
return "", errors.New("owner can only be number and letters")
}
return
}
func parseAndCheckTicket(r *http.Request, key []byte, volName string) (jobj proto.APIAccessReq, ticket cryptoutil.Ticket, ts int64, err error) {
var plaintext []byte
if err = r.ParseForm(); err != nil {
return
}
if plaintext, err = extractClientReqInfo(r); err != nil {
return
}
if err = json.Unmarshal([]byte(plaintext), &jobj); err != nil {
return
}
if err = proto.VerifyAPIAccessReqIDs(&jobj); err != nil {
return
}
ticket, ts, err = extractTicketMess(&jobj, key, volName)
return
}
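// Typical call sequence for parseAndCheckTicket above (sketch; masterKey and
// volName are supplied by the caller):
//
//	jobj, ticket, ts, err := parseAndCheckTicket(r, masterKey, volName)
//
// It decodes the base64 ClientMessage form value, unmarshals it into an
// APIAccessReq, verifies the request IDs, and then validates the ticket,
// verifier and capabilities via extractTicketMess below.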
func extractClientReqInfo(r *http.Request) (plaintext []byte, err error) {
var message string
if err = r.ParseForm(); err != nil {
return
}
if message = r.FormValue(proto.ClientMessage); message == "" {
err = keyNotFound(proto.ClientMessage)
return
}
if plaintext, err = cryptoutil.Base64Decode(message); err != nil {
return
}
return
}
func extractTicketMess(req *proto.APIAccessReq, key []byte, volName string) (ticket cryptoutil.Ticket, ts int64, err error) {
if ticket, err = proto.ExtractTicket(req.Ticket, key); err != nil {
err = fmt.Errorf("extractTicket failed: %s", err.Error())
return
}
if time.Now().Unix() >= ticket.Exp {
err = proto.ErrExpiredTicket
return
}
if ts, err = proto.ParseVerifier(req.Verifier, ticket.SessionKey.Key); err != nil {
err = fmt.Errorf("parseVerifier failed: %s", err.Error())
return
}
if err = proto.CheckAPIAccessCaps(&ticket, proto.APIRsc, req.Type, proto.APIAccess); err != nil {
err = fmt.Errorf("CheckAPIAccessCaps failed: %s", err.Error())
return
}
if err = proto.CheckVOLAccessCaps(&ticket, volName, proto.VOLAccess, proto.MasterNode); err != nil {
err = fmt.Errorf("CheckVOLAccessCaps failed: %s", err.Error())
return
}
return
}
func checkTicket(encodedTicket string, key []byte, Type proto.MsgType) (ticket cryptoutil.Ticket, err error) {
if ticket, err = proto.ExtractTicket(encodedTicket, key); err != nil {
err = fmt.Errorf("extractTicket failed: %s", err.Error())
return
}
if time.Now().Unix() >= ticket.Exp {
err = proto.ErrExpiredTicket
return
}
if err = proto.CheckAPIAccessCaps(&ticket, proto.APIRsc, Type, proto.APIAccess); err != nil {
err = fmt.Errorf("CheckAPIAccessCaps failed: %s", err.Error())
return
}
return
}
func newSuccessHTTPReply(data interface{}) *proto.HTTPReply {
return &proto.HTTPReply{Code: proto.ErrCodeSuccess, Msg: proto.ErrSuc.Error(), Data: data}
}
func newErrHTTPReply(err error) *proto.HTTPReply {
if err == nil {
return newSuccessHTTPReply("")
}
code, ok := proto.Err2CodeMap[err]
if ok {
return &proto.HTTPReply{Code: code, Msg: err.Error()}
}
return &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()}
}
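// Handlers below typically pair these constructors with sendOkReply and
// sendErrReply, e.g. (sketch):
//
//	sendOkReply(w, r, newSuccessHTTPReply("success"))
//	sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
//
// Errors found in proto.Err2CodeMap keep their mapped code; any other error
// falls back to proto.ErrCodeInternalError.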
func sendOkReply(w http.ResponseWriter, r *http.Request, httpReply *proto.HTTPReply) (err error) {
// Hold the read lock of lockable payloads while they are being marshalled.
switch data := httpReply.Data.(type) {
case *DataPartition:
data.RLock()
defer data.RUnlock()
case *MetaPartition:
data.RLock()
defer data.RUnlock()
case *MetaNode:
data.RLock()
defer data.RUnlock()
case *DataNode:
data.RLock()
defer data.RUnlock()
default:
// do nothing
}
reply, err := json.Marshal(httpReply)
if err != nil {
log.LogErrorf("fail to marshal http reply. URL[%v],remoteAddr[%v] err:[%v]", r.URL, r.RemoteAddr, err)
http.Error(w, "fail to marshal http reply", http.StatusBadRequest)
return
}
if acceptEncoding := r.Header.Get(proto.HeaderAcceptEncoding); acceptEncoding != "" {
if compressed, errx := compressor.New(acceptEncoding).Compress(reply); errx == nil {
w.Header().Set(proto.HeaderContentEncoding, acceptEncoding)
reply = compressed
}
}
send(w, r, reply)
return
}
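// Note on sendOkReply above: a request carrying "Accept-Encoding: gzip" gets a
// compressed body with "Content-Encoding: gzip" when the compressor supports
// that encoding; if compression fails, the uncompressed JSON reply is sent as is.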
func send(w http.ResponseWriter, r *http.Request, reply []byte) {
w.Header().Set("content-type", "application/json")
w.Header().Set("Content-Length", strconv.Itoa(len(reply)))
if _, err := w.Write(reply); err != nil {
log.LogErrorf("fail to write http len[%d].URL[%v],remoteAddr[%v] err:[%v]", len(reply), r.URL, r.RemoteAddr, err)
return
}
log.LogInfof("URL[%v],remoteAddr[%v],response ok", r.URL, r.RemoteAddr)
return
}
func sendErrReply(w http.ResponseWriter, r *http.Request, httpReply *proto.HTTPReply) {
log.LogInfof("URL[%v],remoteAddr[%v],response", r.URL, r.RemoteAddr)
reply, err := json.Marshal(httpReply)
if err != nil {
log.LogErrorf("fail to marshal http reply. URL[%v],remoteAddr[%v] err:[%v]", r.URL, r.RemoteAddr, err)
http.Error(w, "fail to marshal http reply", http.StatusBadRequest)
return
}
w.Header().Set("content-type", "application/json")
w.Header().Set("Content-Length", strconv.Itoa(len(reply)))
if _, err = w.Write(reply); err != nil {
log.LogErrorf("fail to write http len[%d].URL[%v],remoteAddr[%v] err:[%v]", len(reply), r.URL, r.RemoteAddr, err)
}
return
}
func parseRequestToUpdateDecommissionLimit(r *http.Request) (limit uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
if value = r.FormValue(decommissionLimit); value == "" {
err = keyNotFound(decommissionLimit)
return
}
limit, err = strconv.ParseUint(value, 10, 32)
if err != nil {
return
}
return
}
func parseSetConfigParam(r *http.Request) (key string, value string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if value = r.FormValue(cfgmetaPartitionInodeIdStep); value == "" {
err = keyNotFound("config")
return
}
key = cfgmetaPartitionInodeIdStep
log.LogInfo("parseSetConfigParam success.")
return
}
func parseGetConfigParam(r *http.Request) (key string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if key = r.FormValue(configKey); key == "" {
err = keyNotFound("config")
return
}
log.LogInfo("parseGetConfigParam success.")
return
}
func parserSetQuotaParam(r *http.Request, req *proto.SetMasterQuotaReuqest) (err error) {
if err = r.ParseForm(); err != nil {
return
}
if req.VolName, err = extractName(r); err != nil {
return
}
if req.MaxFiles, err = extractUint64WithDefault(r, MaxFilesKey, math.MaxUint64); err != nil {
return
}
if req.MaxBytes, err = extractUint64WithDefault(r, MaxBytesKey, math.MaxUint64); err != nil {
return
}
var body []byte
if body, err = io.ReadAll(r.Body); err != nil {
return
}
if err = json.Unmarshal(body, &req.PathInfos); err != nil {
return
}
log.LogInfo("parserSetQuotaParam success.")
return
}
func parserUpdateQuotaParam(r *http.Request, req *proto.UpdateMasterQuotaReuqest) (err error) {
if err = r.ParseForm(); err != nil {
return
}
if req.VolName, err = extractName(r); err != nil {
return
}
if req.QuotaId, err = extractQuotaId(r); err != nil {
return
}
if req.MaxFiles, err = extractUint64WithDefault(r, MaxFilesKey, math.MaxUint64); err != nil {
return
}
if req.MaxBytes, err = extractUint64WithDefault(r, MaxBytesKey, math.MaxUint64); err != nil {
return
}
log.LogInfo("parserUpdateQuotaParam success.")
return
}
func parseDeleteQuotaParam(r *http.Request) (volName string, quotaId uint32, err error) {
if err = r.ParseForm(); err != nil {
return
}
if volName, err = extractName(r); err != nil {
return
}
if quotaId, err = extractQuotaId(r); err != nil {
return
}
return
}
func parseGetQuotaParam(r *http.Request) (volName string, quotaId uint32, err error) {
if err = r.ParseForm(); err != nil {
return
}
if volName, err = extractName(r); err != nil {
return
}
if quotaId, err = extractQuotaId(r); err != nil {
return
}
return
}
func extractPath(r *http.Request) (fullPath string, err error) {
if fullPath = r.FormValue(fullPathKey); fullPath == "" {
err = keyNotFound(fullPathKey)
return
}
return
}
func extractQuotaId(r *http.Request) (quotaId uint32, err error) {
var value string
if value = r.FormValue(quotaKey); value == "" {
err = keyNotFound(quotaKey)
return
}
tmp, err := strconv.ParseUint(value, 10, 32)
quotaId = uint32(tmp)
return
}
func extractInodeId(r *http.Request) (inode uint64, err error) {
var value string
if value = r.FormValue(inodeKey); value == "" {
err = keyNotFound(inodeKey)
return
}
return strconv.ParseUint(value, 10, 64)
}
func parseRequestToUpdateDecommissionDiskFactor(r *http.Request) (factor float64, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
if value = r.FormValue(decommissionDiskFactor); value == "" {
err = keyNotFound(decommissionDiskFactor)
return
}
return strconv.ParseFloat(value, 64)
}
func parseS3QosReq(r *http.Request, req *proto.S3QosRequest) (err error) {
var body []byte
if body, err = io.ReadAll(r.Body); err != nil {
return
}
if err = json.Unmarshal(body, &req); err != nil {
return
}
log.LogInfo("parseS3QosReq success.")
return
}
package master
import (
"context"
"encoding/json"
"fmt"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
const (
defaultApiLimitBurst = 1
)
type ApiLimitInfo struct {
ApiName string `json:"api_name"`
QueryPath string `json:"query_path"`
Limit uint32 `json:"limit"` // qps
LimiterTimeout uint32 `json:"limiter_timeout"`
Limiter *rate.Limiter `json:"-"`
}
func (li *ApiLimitInfo) InitLimiter() {
li.Limiter = rate.NewLimiter(rate.Limit(li.Limit), defaultApiLimitBurst)
}
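// With defaultApiLimitBurst == 1, a Limit of 100 produces a limiter that
// admits roughly 100 requests per second with no bursting; Wait (below)
// blocks until a token is available or its per-api timeout expires.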
type ApiLimiter struct {
m sync.RWMutex
limiterInfos map[string]*ApiLimitInfo
}
func newApiLimiter() *ApiLimiter {
return &ApiLimiter{
limiterInfos: make(map[string]*ApiLimitInfo),
}
}
func (l *ApiLimiter) clear() {
for k := range l.limiterInfos {
delete(l.limiterInfos, k)
}
}
func (l *ApiLimiter) Clear() {
l.m.Lock()
defer l.m.Unlock()
l.clear()
}
func (l *ApiLimiter) Replace(limiterInfos map[string]*ApiLimitInfo) {
l.m.Lock()
defer l.m.Unlock()
l.clear()
for k, v := range limiterInfos {
l.limiterInfos[k] = v
}
}
func (l *ApiLimiter) SetLimiter(apiName string, limit uint32, limiterTimeout uint32) (err error) {
var normalizedName string
var qPath string
if err, normalizedName, qPath = l.IsApiNameValid(apiName); err != nil {
return err
}
lInfo := &ApiLimitInfo{
ApiName: normalizedName,
QueryPath: qPath,
Limit: limit,
LimiterTimeout: limiterTimeout,
}
lInfo.InitLimiter()
l.m.Lock()
l.limiterInfos[qPath] = lInfo
l.m.Unlock()
return nil
}
func (l *ApiLimiter) RmLimiter(apiName string) (err error) {
var qPath string
if err, _, qPath = l.IsApiNameValid(apiName); err != nil {
return err
}
l.m.Lock()
delete(l.limiterInfos, qPath)
l.m.Unlock()
return nil
}
func (l *ApiLimiter) Wait(qPath string) (err error) {
var lInfo *ApiLimitInfo
var ok bool
l.m.RLock()
if lInfo, ok = l.limiterInfos[qPath]; !ok {
l.m.RUnlock()
log.LogDebugf("no api limiter for api[%v]", qPath)
return nil
}
l.m.RUnlock()
ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(lInfo.LimiterTimeout))
defer cancel()
err = lInfo.Limiter.Wait(ctx)
if err != nil {
log.LogErrorf("wait api limiter for api[%v] failed: %v", qPath, err)
return err
}
log.LogDebugf("wait api limiter for api[%v]", qPath)
return nil
}
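// Usage sketch (the api name "getcluster" is only illustrative; valid names
// are the keys of proto.GApiInfo):
//
//	limiter := newApiLimiter()
//	if err := limiter.SetLimiter("getcluster", 100, 5); err != nil {
//		// unknown api name
//	}
//	// in the request path, block until a token is available or the
//	// limiter timeout (5s here) expires:
//	_ = limiter.Wait(qPath)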
func (l *ApiLimiter) IsApiNameValid(name string) (err error, normalizedName, qPath string) {
normalizedName = strings.ToLower(name)
var ok bool
if qPath, ok = proto.GApiInfo[normalizedName]; ok {
return nil, normalizedName, qPath
}
return fmt.Errorf("api name [%v] is not valid", name), normalizedName, qPath
}
func (l *ApiLimiter) IsFollowerLimiter(qPath string) bool {
if qPath == proto.AdminGetIP || qPath == proto.ClientDataPartitions {
return true
}
return false
}
func (l *ApiLimiter) updateLimiterInfoFromLeader(value []byte) {
limiterInfos := make(map[string]*ApiLimitInfo)
if err := json.Unmarshal(value, &limiterInfos); err != nil {
log.LogErrorf("action[updateLimiterInfoFromLeader], unmarshal err:%v", err.Error())
return
}
for _, v := range limiterInfos {
v.InitLimiter()
}
l.m.Lock()
l.limiterInfos = limiterInfos
l.m.Unlock()
log.LogInfof("action[updateLimiterInfoFromLeader], limiter info[%v]", value)
}
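// The value handled by updateLimiterInfoFromLeader above is the JSON-encoded
// limiterInfos map, e.g. (sketch; the query-path key is illustrative):
//
//	{"/admin/getCluster":{"api_name":"getcluster","query_path":"/admin/getCluster","limit":100,"limiter_timeout":5}}
//
// Limiter is tagged `json:"-"`, so each entry's rate.Limiter is rebuilt via
// InitLimiter after unmarshalling.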
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"context"
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"regexp"
"strconv"
"strings"
"sync/atomic"
"time"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/iputil"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
func apiToMetricsName(api string) (reqMetricName string) {
var builder strings.Builder
builder.WriteString("req")
// Prometheus metric names do not allow '/', so replace it with '_'.
builder.WriteString(strings.Replace(api, "/", "_", -1))
return builder.String()
}
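// For example, apiToMetricsName("/admin/getCluster") returns
// "req_admin_getCluster": the "req" prefix plus the path with every '/'
// replaced by '_'.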
func doStatAndMetric(statName string, metric *exporter.TimePointCount, err error, metricLabels map[string]string) {
if metric == nil {
return
}
if metricLabels == nil {
metric.Set(err)
} else {
metric.SetWithLabels(err, metricLabels)
}
startTime := metric.GetStartTime()
stat.EndStat(statName, err, &startTime, 1)
}
// NodeView provides the view of the data or meta node.
type NodeView struct {
Addr string
Status bool
ID uint64
IsWritable bool
}
// InvalidNodeView provides the view of an invalid data or meta node.
type InvalidNodeView struct {
Addr string
ID uint64
OldID uint64
NodeType string
}
// TopologyView provides the topology view of the cluster.
type TopologyView struct {
Zones []*ZoneView
}
type NodeSetView struct {
DataNodeLen int
MetaNodeLen int
MetaNodes []proto.NodeView
DataNodes []proto.NodeView
}
func newNodeSetView(dataNodeLen, metaNodeLen int) *NodeSetView {
return &NodeSetView{DataNodes: make([]proto.NodeView, 0), MetaNodes: make([]proto.NodeView, 0), DataNodeLen: dataNodeLen, MetaNodeLen: metaNodeLen}
}
// ZoneView define the view of zone
type ZoneView struct {
Name string
Status string
DataNodesetSelector string
MetaNodesetSelector string
NodeSet map[uint64]*NodeSetView
}
func newZoneView(name string) *ZoneView {
return &ZoneView{NodeSet: make(map[uint64]*NodeSetView, 0), Name: name}
}
type badPartitionView = proto.BadPartitionView
func (m *Server) setClusterInfo(w http.ResponseWriter, r *http.Request) {
var (
dirLimit uint32
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetClusterInfo))
defer func() {
doStatAndMetric(proto.AdminSetClusterInfo, metric, err, nil)
}()
if dirLimit, err = parseAndExtractDirLimit(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dirLimit < proto.MinDirChildrenNumLimit {
dirLimit = proto.MinDirChildrenNumLimit
}
if err = m.cluster.setClusterInfo(dirLimit); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set dir limit(min:%v, max:%v) to %v successfully",
proto.MinDirChildrenNumLimit, math.MaxUint32, dirLimit)))
}
func (m *Server) getMonitorPushAddr(w http.ResponseWriter, r *http.Request) {
var (
addr string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetMonitorPushAddr))
defer func() {
doStatAndMetric(proto.AdminGetMonitorPushAddr, metric, err, nil)
}()
addr = m.cluster.getMonitorPushAddr()
sendOkReply(w, r, newSuccessHTTPReply(addr))
}
// Set the threshold of the memory usage on each meta node.
// If the memory usage reaches this threshold, all the meta partitions on the node will be marked as readOnly.
func (m *Server) setMetaNodeThreshold(w http.ResponseWriter, r *http.Request) {
var (
threshold float64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetMetaNodeThreshold))
defer func() {
doStatAndMetric(proto.AdminSetMetaNodeThreshold, metric, err, nil)
}()
if threshold, err = parseAndExtractThreshold(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.setMetaNodeThreshold(float32(threshold)); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set threshold to %v successfully", threshold)))
}
// Turn on or off the automatic allocation of the data partitions.
// If DisableAutoAllocate == off, then we WILL automatically allocate new data partitions for the volume when:
// 1. the used space is below the max capacity,
// 2. and the number of r&w data partitions is less than 20.
//
// If DisableAutoAllocate == on, then we WILL NOT automatically allocate new data partitions for the volume.
func (m *Server) setupAutoAllocation(w http.ResponseWriter, r *http.Request) {
var (
status bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminClusterFreeze))
defer func() {
doStatAndMetric(proto.AdminClusterFreeze, metric, err, nil)
}()
if status, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.setDisableAutoAllocate(status); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set DisableAutoAllocate to %v successfully", status)))
}
func (m *Server) forbidVolume(w http.ResponseWriter, r *http.Request) {
var (
status bool
name string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminVolForbidden))
defer func() {
doStatAndMetric(proto.AdminVolForbidden, metric, err, nil)
if err != nil {
log.LogErrorf("set volume forbidden failed, error: %v", err)
} else {
log.LogInfof("set volume forbidden to (%v) success", status)
}
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if status, err = parseAndExtractForbidden(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
vol, err := m.cluster.getVol(name)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
oldForbidden := vol.Forbidden
vol.Forbidden = status
defer func() {
if err != nil {
vol.Forbidden = oldForbidden
}
}()
if err = m.cluster.syncUpdateVol(vol); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if status {
// set data partition status to read only
vol.setDpRdOnly()
// set meta partition status to read only
vol.setMpRdOnly()
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set volume forbidden to (%v) success", status)))
}
func (m *Server) setEnableAuditLogForVolume(w http.ResponseWriter, r *http.Request) {
var (
status bool
name string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminVolEnableAuditLog))
defer func() {
doStatAndMetric(proto.AdminVolEnableAuditLog, metric, err, nil)
if err != nil {
log.LogErrorf("set volume aduit log failed, error: %v", err)
} else {
log.LogInfof("set volume aduit log to (%v) success", status)
}
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if status, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
vol, err := m.cluster.getVol(name)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
oldEnable := vol.EnableAuditLog
vol.EnableAuditLog = status
defer func() {
if err != nil {
vol.EnableAuditLog = oldEnable
}
}()
if err = m.cluster.syncUpdateVol(vol); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set volume audit log to (%v) success", status)))
}
func (m *Server) setupForbidMetaPartitionDecommission(w http.ResponseWriter, r *http.Request) {
var (
status bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminClusterForbidMpDecommission))
defer func() {
doStatAndMetric(proto.AdminClusterForbidMpDecommission, metric, err, nil)
if err != nil {
log.LogErrorf("set ForbidMpDecommission failed, error: %v", err)
} else {
log.LogInfof("set ForbidMpDecommission to (%v) success", status)
}
}()
if status, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.setForbidMpDecommission(status); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set ForbidMpDecommission to %v successfully", status)))
}
// View the topology of the cluster.
func (m *Server) getTopology(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetTopologyView))
defer func() {
doStatAndMetric(proto.GetTopologyView, metric, nil, nil)
}()
tv := &TopologyView{
Zones: make([]*ZoneView, 0),
}
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
cv := newZoneView(zone.name)
cv.Status = zone.getStatusToString()
cv.DataNodesetSelector = zone.GetDataNodesetSelector()
cv.MetaNodesetSelector = zone.GetMetaNodesetSelector()
tv.Zones = append(tv.Zones, cv)
nsc := zone.getAllNodeSet()
for _, ns := range nsc {
nsView := newNodeSetView(ns.dataNodeLen(), ns.metaNodeLen())
cv.NodeSet[ns.ID] = nsView
ns.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
nsView.DataNodes = append(nsView.DataNodes, proto.NodeView{
ID: dataNode.ID, Addr: dataNode.Addr,
DomainAddr: dataNode.DomainAddr, IsActive: dataNode.isActive, IsWritable: dataNode.isWriteAble(),
})
return true
})
ns.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
nsView.MetaNodes = append(nsView.MetaNodes, proto.NodeView{
ID: metaNode.ID, Addr: metaNode.Addr,
DomainAddr: metaNode.DomainAddr, IsActive: metaNode.IsActive, IsWritable: metaNode.isWritable(),
})
return true
})
}
}
sendOkReply(w, r, newSuccessHTTPReply(tv))
}
func (m *Server) updateZone(w http.ResponseWriter, r *http.Request) {
var (
name string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UpdateZone))
defer func() {
doStatAndMetric(proto.UpdateZone, metric, err, nil)
}()
if name = r.FormValue(nameKey); name == "" {
err = keyNotFound(nameKey)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
status, err := extractStatus(r)
dataNodesetSelector := extractDataNodesetSelector(r)
metaNodesetSelector := extractMetaNodesetSelector(r)
dataNodeSelector := extractDataNodeSelector(r)
metaNodeSelector := extractMetaNodeSelector(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
zone, err := m.cluster.t.getZone(name)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeZoneNotExists, Msg: err.Error()})
return
}
if status {
zone.setStatus(normalZone)
} else {
zone.setStatus(unavailableZone)
}
err = zone.updateNodesetSelector(m.cluster, dataNodesetSelector, metaNodesetSelector)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
err = m.updateZoneNodeSelector(zone.name, dataNodeSelector, metaNodeSelector)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("update zone status to [%v] successfully", status)))
}
func (m *Server) listZone(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetAllZones))
defer func() {
doStatAndMetric(proto.GetAllZones, metric, nil, nil)
}()
zones := m.cluster.t.getAllZones()
zoneViews := make([]*ZoneView, 0)
for _, zone := range zones {
cv := newZoneView(zone.name)
cv.Status = zone.getStatusToString()
cv.DataNodesetSelector = zone.GetDataNodesetSelector()
cv.MetaNodesetSelector = zone.GetMetaNodesetSelector()
zoneViews = append(zoneViews, cv)
}
sendOkReply(w, r, newSuccessHTTPReply(zoneViews))
}
func (m *Server) listNodeSets(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetAllNodeSets))
defer func() {
doStatAndMetric(proto.GetAllNodeSets, metric, nil, nil)
}()
var zones []*Zone
// if zoneName is empty, list all nodeSets, otherwise list node sets in the specified zone
zoneName := r.FormValue(zoneNameKey)
if zoneName == "" {
zones = m.cluster.t.getAllZones()
} else {
zone, err := m.cluster.t.getZone(zoneName)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeZoneNotExists, Msg: err.Error()})
return
}
zones = []*Zone{zone}
}
nodeSetStats := make([]*proto.NodeSetStat, 0)
for _, zone := range zones {
nsc := zone.getAllNodeSet()
for _, ns := range nsc {
nsStat := &proto.NodeSetStat{
ID: ns.ID,
Capacity: ns.Capacity,
Zone: zone.name,
DataNodeNum: ns.dataNodeLen(),
MetaNodeNum: ns.metaNodeLen(),
}
nodeSetStats = append(nodeSetStats, nsStat)
}
}
sendOkReply(w, r, newSuccessHTTPReply(nodeSetStats))
}
func (m *Server) getNodeSet(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetNodeSet))
defer func() {
doStatAndMetric(proto.GetNodeSet, metric, nil, nil)
}()
nodeSetStr := r.FormValue(nodesetIdKey)
if nodeSetStr == "" {
err := keyNotFound(nodesetIdKey)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
nodeSetId, err := strconv.ParseUint(nodeSetStr, 10, 64)
if err != nil {
err = fmt.Errorf("invalid nodeSetId: %v", nodeSetStr)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
ns, err := m.cluster.t.getNodeSetByNodeSetId(nodeSetId)
if err != nil {
err := nodeSetNotFound(nodeSetId)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeNodeSetNotExists, Msg: err.Error()})
return
}
nsStat := &proto.NodeSetStatInfo{
ID: ns.ID,
Capacity: ns.Capacity,
Zone: ns.zoneName,
DataNodeSelector: ns.GetDataNodeSelector(),
MetaNodeSelector: ns.GetMetaNodeSelector(),
}
ns.dataNodes.Range(func(key, value interface{}) bool {
dn := value.(*DataNode)
nsStat.DataNodes = append(nsStat.DataNodes, &proto.NodeStatView{
Addr: dn.Addr,
Status: dn.isActive,
DomainAddr: dn.DomainAddr,
ID: dn.ID,
IsWritable: dn.isWriteAble(),
Total: dn.Total,
Used: dn.Used,
Avail: dn.Total - dn.Used,
})
return true
})
ns.metaNodes.Range(func(key, value interface{}) bool {
mn := value.(*MetaNode)
nsStat.MetaNodes = append(nsStat.MetaNodes, &proto.NodeStatView{
Addr: mn.Addr,
Status: mn.IsActive,
DomainAddr: mn.DomainAddr,
ID: mn.ID,
IsWritable: mn.isWritable(),
Total: mn.Total,
Used: mn.Used,
Avail: mn.Total - mn.Used,
})
return true
})
sendOkReply(w, r, newSuccessHTTPReply(nsStat))
}
func (m *Server) updateNodeSet(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.UpdateNodeSet))
defer func() {
doStatAndMetric(proto.UpdateNodeSet, metric, nil, nil)
}()
nodeSetStr := r.FormValue(nodesetIdKey)
if nodeSetStr == "" {
err := keyNotFound(nodesetIdKey)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
nodeSetId, err := strconv.ParseUint(nodeSetStr, 10, 64)
if err != nil {
err = fmt.Errorf("invalid nodeSetId: %v", nodeSetStr)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
ns, err := m.cluster.t.getNodeSetByNodeSetId(nodeSetId)
if err != nil {
err := nodeSetNotFound(nodeSetId)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeNodeSetNotExists, Msg: err.Error()})
return
}
dataNodeSelector := extractDataNodeSelector(r)
metaNodeSelector := extractMetaNodeSelector(r)
needSync := false
if dataNodeSelector != "" && dataNodeSelector != ns.GetDataNodeSelector() {
ns.SetDataNodeSelector(dataNodeSelector)
needSync = true
}
if metaNodeSelector != "" && metaNodeSelector != ns.GetMetaNodeSelector() {
ns.SetMetaNodeSelector(metaNodeSelector)
needSync = true
}
if needSync {
err = m.cluster.syncUpdateNodeSet(ns)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
func (m *Server) clusterStat(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminClusterStat))
defer func() {
doStatAndMetric(proto.AdminClusterStat, metric, nil, nil)
}()
cs := &proto.ClusterStatInfo{
DataNodeStatInfo: m.cluster.dataNodeStatInfo,
MetaNodeStatInfo: m.cluster.metaNodeStatInfo,
ZoneStatInfo: make(map[string]*proto.ZoneStat, 0),
}
for zoneName, zoneStat := range m.cluster.zoneStatInfos {
cs.ZoneStatInfo[zoneName] = zoneStat
}
sendOkReply(w, r, newSuccessHTTPReply(cs))
}
func (m *Server) UidOperate(w http.ResponseWriter, r *http.Request) {
var (
uid uint32
err error
volName string
vol *Vol
op uint64
value string
capSize uint64
uidList []*proto.UidSpaceInfo
uidInfo *proto.UidSpaceInfo
ok bool
)
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if value = r.FormValue(OperateKey); value == "" {
err = keyNotFound(OperateKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
op, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = fmt.Errorf("parseUintParam %s-%s is not legal, err %s", OperateKey, value, err.Error())
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if op != util.UidLimitList {
if uid, err = extractUint32(r, UIDKey); err != nil {
err = keyNotFound(UIDKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
if op == util.UidAddLimit {
if capSize, err = extractPositiveUint64(r, CapacityKey); err != nil {
err = keyNotFound(CapacityKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
log.LogDebugf("uidOperate. name %v op %v uid %v", volName, op, uid)
if vol, err = m.cluster.getVol(volName); err != nil {
log.LogDebugf("aclOperate. name %v not found", volName)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
ok = true
switch op {
case util.UidGetLimit:
ok, uidInfo = vol.uidSpaceManager.checkUid(uid)
uidList = append(uidList, uidInfo)
case util.AclAddIP:
ok = vol.uidSpaceManager.addUid(uid, capSize)
case util.AclDelIP:
ok = vol.uidSpaceManager.removeUid(uid)
case util.AclListIP:
uidList = vol.uidSpaceManager.listAll()
default:
// do nothing
}
rsp := &proto.UidSpaceRsp{
OK: ok,
UidSpaceArr: uidList,
}
_ = sendOkReply(w, r, newSuccessHTTPReply(rsp))
}
func (m *Server) aclOperate(w http.ResponseWriter, r *http.Request) {
var (
ip string
err error
volName string
vol *Vol
op uint64
value string
ok, res bool
ipList []*proto.AclIpInfo
)
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if value = r.FormValue(OperateKey); value == "" {
err = keyNotFound(OperateKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
op, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = fmt.Errorf("parseUintParam %s-%s is not legal, err %s", OperateKey, value, err.Error())
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if op != util.AclListIP {
if ip = r.FormValue(IPKey); ip == "" {
err = keyNotFound(IPKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
log.LogDebugf("aclOperate. name %v op %v ip %v", volName, op, ip)
if vol, err = m.cluster.getVol(volName); err != nil {
log.LogDebugf("aclOperate. name %v not found", volName)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
ok = true
opAclRes := vol.aclMgr.aclOperate(op, ip)
switch op {
case util.AclCheckIP:
if ipList, res = opAclRes.([]*proto.AclIpInfo); !res {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("inner error")))
return
}
if len(ipList) > 0 {
ok = false
}
case util.AclAddIP, util.AclDelIP:
if opAclRes != nil {
if err, res = opAclRes.(error); !res {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("inner error")))
return
}
}
case util.AclListIP:
if ipList, res = opAclRes.([]*proto.AclIpInfo); !res {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("inner error")))
return
}
default:
// do nothing
}
rsp := &proto.AclRsp{
OK: ok,
List: ipList,
}
_ = sendOkReply(w, r, newSuccessHTTPReply(rsp))
}
func (m *Server) getCluster(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetCluster))
defer func() {
doStatAndMetric(proto.AdminGetCluster, metric, nil, nil)
}()
cv := &proto.ClusterView{
Name: m.cluster.Name,
CreateTime: time.Unix(m.cluster.CreateTime, 0).Format(proto.TimeFormat),
LeaderAddr: m.leaderInfo.addr,
DisableAutoAlloc: m.cluster.DisableAutoAllocate,
ForbidMpDecommission: m.cluster.ForbidMpDecommission,
MetaNodeThreshold: m.cluster.cfg.MetaNodeThreshold,
Applied: m.fsm.applied,
MaxDataPartitionID: m.cluster.idAlloc.dataPartitionID,
MaxMetaNodeID: m.cluster.idAlloc.commonID,
MaxMetaPartitionID: m.cluster.idAlloc.metaPartitionID,
MasterNodes: make([]proto.NodeView, 0),
MetaNodes: make([]proto.NodeView, 0),
DataNodes: make([]proto.NodeView, 0),
VolStatInfo: make([]*proto.VolStatInfo, 0),
BadPartitionIDs: make([]proto.BadPartitionView, 0),
BadMetaPartitionIDs: make([]proto.BadPartitionView, 0),
}
vols := m.cluster.allVolNames()
cv.MasterNodes = m.cluster.allMasterNodes()
cv.MetaNodes = m.cluster.allMetaNodes()
cv.DataNodes = m.cluster.allDataNodes()
cv.DataNodeStatInfo = m.cluster.dataNodeStatInfo
cv.MetaNodeStatInfo = m.cluster.metaNodeStatInfo
for _, name := range vols {
stat, ok := m.cluster.volStatInfo.Load(name)
if !ok {
cv.VolStatInfo = append(cv.VolStatInfo, newVolStatInfo(name, 0, 0, 0, 0, 0))
continue
}
cv.VolStatInfo = append(cv.VolStatInfo, stat.(*volStatInfo))
}
cv.BadPartitionIDs = m.cluster.getBadDataPartitionsView()
cv.BadMetaPartitionIDs = m.cluster.getBadMetaPartitionsView()
sendOkReply(w, r, newSuccessHTTPReply(cv))
}
func (m *Server) getApiList(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetMasterApiList))
defer func() {
doStatAndMetric(proto.AdminGetMasterApiList, metric, nil, nil)
}()
sendOkReply(w, r, newSuccessHTTPReply(proto.GApiInfo))
}
func (m *Server) setApiQpsLimit(w http.ResponseWriter, r *http.Request) {
var (
name string
limit uint32
timeout uint32
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetApiQpsLimit))
defer func() {
doStatAndMetric(proto.AdminSetApiQpsLimit, metric, err, nil)
}()
if name, limit, timeout, err = parseRequestToSetApiQpsLimit(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.apiLimiter.SetLimiter(name, limit, timeout); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// persist to rocksdb
var qPath string
if err, _, qPath = m.cluster.apiLimiter.IsApiNameValid(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.syncPutApiLimiterInfo(m.cluster.apiLimiter.IsFollowerLimiter(qPath)); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set api qps limit failed: %v", err)))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set api qps limit success: name: %v, limit: %v, timeout: %v",
name, limit, timeout)))
return
}
func (m *Server) getApiQpsLimit(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetMasterApiList))
defer func() {
doStatAndMetric(proto.AdminGetMasterApiList, metric, nil, nil)
}()
m.cluster.apiLimiter.m.RLock()
v, err := json.Marshal(m.cluster.apiLimiter.limiterInfos)
m.cluster.apiLimiter.m.RUnlock()
if err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("get api qps limit failed: %v", err)))
return
}
limiterInfos := make(map[string]*ApiLimitInfo)
if err = json.Unmarshal(v, &limiterInfos); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("get api qps limit failed: %v", err)))
return
}
sendOkReply(w, r, newSuccessHTTPReply(limiterInfos))
}
func (m *Server) rmApiQpsLimit(w http.ResponseWriter, r *http.Request) {
var (
name string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminRemoveApiQpsLimit))
defer func() {
doStatAndMetric(proto.AdminRemoveApiQpsLimit, metric, err, nil)
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.apiLimiter.RmLimiter(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// persist to rocksdb
var qPath string
if err, _, qPath = m.cluster.apiLimiter.IsApiNameValid(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.syncPutApiLimiterInfo(m.cluster.apiLimiter.IsFollowerLimiter(qPath)); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set api qps limit failed: %v", err)))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("rm api qps limit success: name: %v",
name)))
}
func (m *Server) getIPAddr(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetIP))
defer func() {
doStatAndMetric(proto.AdminGetIP, metric, nil, nil)
}()
m.cluster.loadClusterValue()
batchCount := atomic.LoadUint64(&m.cluster.cfg.MetaNodeDeleteBatchCount)
limitRate := atomic.LoadUint64(&m.cluster.cfg.DataNodeDeleteLimitRate)
deleteSleepMs := atomic.LoadUint64(&m.cluster.cfg.MetaNodeDeleteWorkerSleepMs)
autoRepairRate := atomic.LoadUint64(&m.cluster.cfg.DataNodeAutoRepairLimitRate)
dirChildrenNumLimit := atomic.LoadUint32(&m.cluster.cfg.DirChildrenNumLimit)
dpMaxRepairErrCnt := atomic.LoadUint64(&m.cluster.cfg.DpMaxRepairErrCnt)
cInfo := &proto.ClusterInfo{
Cluster: m.cluster.Name,
MetaNodeDeleteBatchCount: batchCount,
MetaNodeDeleteWorkerSleepMs: deleteSleepMs,
DataNodeDeleteLimitRate: limitRate,
DataNodeAutoRepairLimitRate: autoRepairRate,
DpMaxRepairErrCnt: dpMaxRepairErrCnt,
DirChildrenNumLimit: dirChildrenNumLimit,
// Ip: strings.Split(r.RemoteAddr, ":")[0],
Ip: iputil.RealIP(r),
EbsAddr: m.bStoreAddr,
ServicePath: m.servicePath,
ClusterUuid: m.cluster.clusterUuid,
ClusterUuidEnable: m.cluster.clusterUuidEnable,
}
sendOkReply(w, r, newSuccessHTTPReply(cInfo))
}
func (m *Server) createMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
vol *Vol
volName string
count int
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminCreateMetaPartition))
defer func() {
doStatAndMetric(proto.AdminCreateMetaPartition, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, count, err = validateRequestToCreateMetaPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if vol.status() == proto.VolStatusMarkDelete {
log.LogErrorf("action[createMetaPartition] vol[%s] is marked delete", vol.Name)
err = fmt.Errorf("volume [%v] is marked delete", vol.Name)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if vol.Forbidden {
log.LogErrorf("action[createMetaPartition] vol[%s] is forbidden", vol.Name)
err = fmt.Errorf("volume [%v] is forbidden", vol.Name)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = vol.addMetaPartitions(m.cluster, count); err != nil {
log.LogErrorf("create meta partition fail: volume(%v) err(%v)", volName, err)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply("create meta partition successfully"))
}
func parsePreloadDpReq(r *http.Request, preload *DataPartitionPreLoad) (err error) {
if err = r.ParseForm(); err != nil {
return
}
preload.preloadZoneName = r.FormValue(zoneNameKey)
if preload.PreloadCacheTTL, err = extractPositiveUint64(r, cacheTTLKey); err != nil {
return
}
if preload.preloadCacheCapacity, err = extractPositiveUint(r, volCapacityKey); err != nil {
return
}
if preload.preloadReplicaNum, err = extractUintWithDefault(r, replicaNumKey, 1); err != nil {
return
}
if preload.preloadReplicaNum < 1 || preload.preloadReplicaNum > 16 {
return fmt.Errorf("preload replicaNum must be between [%d] to [%d], now[%d]", 1, 16, preload.preloadReplicaNum)
}
return
}
func (m *Server) createPreLoadDataPartition(w http.ResponseWriter, r *http.Request) {
var (
volName string
vol *Vol
err error
dps []*DataPartition
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminCreatePreLoadDataPartition))
defer func() {
doStatAndMetric(proto.AdminCreatePreLoadDataPartition, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
log.LogInfof("action[createPreLoadDataPartition]")
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if !proto.IsCold(vol.VolType) {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("only low frequency volume can create preloadDp")))
return
}
preload := new(DataPartitionPreLoad)
err = parsePreloadDpReq(r, preload)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
total := vol.CalculatePreloadCapacity() + uint64(preload.preloadCacheCapacity)
if total > vol.CacheCapacity {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("preload total capacity[%d] can't be bigger than cache capacity [%d]",
total, vol.CacheCapacity)))
return
}
log.LogInfof("[createPreLoadDataPartition] start create preload dataPartition, vol(%s), req(%s)", volName, preload.toString())
err, dps = m.cluster.batchCreatePreLoadDataPartition(vol, preload)
if err != nil {
log.LogErrorf("create data partition fail: volume(%v), req(%v) err(%v)", volName, preload.toString(), err)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if len(dps) == 0 {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("create zero datapartition")))
return
}
cv := proto.NewDataPartitionsView()
dpResps := make([]*proto.DataPartitionResponse, 0)
for _, dp := range dps {
dpResp := dp.convertToDataPartitionResponse()
dpResps = append(dpResps, dpResp)
}
log.LogDebugf("action[createPreLoadDataPartition] dps cnt[%v] content[%v]", len(dps), dpResps)
cv.DataPartitions = dpResps
sendOkReply(w, r, newSuccessHTTPReply(cv))
}
func (m *Server) getQosStatus(w http.ResponseWriter, r *http.Request) {
var (
volName string
err error
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosGetStatus))
defer func() {
doStatAndMetric(proto.QosGetStatus, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
sendOkReply(w, r, newSuccessHTTPReply(vol.getQosStatus(m.cluster)))
}
func (m *Server) getClientQosInfo(w http.ResponseWriter, r *http.Request) {
var (
volName string
err error
vol *Vol
host string
id uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosGetClientsLimitInfo))
defer func() {
doStatAndMetric(proto.QosGetClientsLimitInfo, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if host = r.FormValue(addrKey); host != "" {
log.LogInfof("action[getClientQosInfo] host %v", host)
if !checkIp(host) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
}
if value := r.FormValue(idKey); value != "" {
if id, err = strconv.ParseUint(value, 10, 64); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
var rsp interface{}
if rsp, err = vol.getClientLimitInfo(id, host); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
} else {
sendOkReply(w, r, newSuccessHTTPReply(rsp))
}
}
func (m *Server) getQosUpdateMasterLimit(w http.ResponseWriter, r *http.Request) {
var (
err error
value string
limit uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpdateMasterLimit))
defer func() {
doStatAndMetric(proto.QosUpdateMasterLimit, metric, err, nil)
}()
if value = r.FormValue(QosMasterLimit); value != "" {
if limit, err = strconv.ParseUint(value, 10, 64); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("wrong param of limit")))
return
}
if limit < QosMasterAcceptCnt {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("limit too less than %v", QosMasterAcceptCnt)))
return
}
m.cluster.cfg.QosMasterAcceptLimit = limit
m.cluster.QosAcceptLimit.SetLimit(rate.Limit(limit))
if err = m.cluster.syncPutCluster(); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set master not worked %v", err)))
return
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
return
}
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("no param of limit")))
}
func (m *Server) QosUpdateClientParam(w http.ResponseWriter, r *http.Request) {
var (
volName string
value string
parsed uint64
period, triggerCnt uint32
err error
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpdateClientParam))
defer func() {
doStatAndMetric(proto.QosUpdateClientParam, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if value = r.FormValue(ClientReqPeriod); value != "" {
if parsed, err = strconv.ParseUint(value, 10, 32); err != nil || parsed == 0 {
log.LogErrorf("hytemp error %v", err)
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("wrong param of peroid")))
return
}
period = uint32(parsed)
}
if value = r.FormValue(ClientTriggerCnt); value != "" {
if parsed, err = strconv.ParseUint(value, 10, 32); err != nil || parsed == 0 {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("wrong param of triggerCnt")))
return
}
triggerCnt = uint32(parsed)
}
if err = vol.updateClientParam(m.cluster, period, triggerCnt); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
func parseRequestQos(r *http.Request, isMagnify bool, isEnableIops bool) (qosParam *qosArgs, err error) {
qosParam = &qosArgs{}
var value int
var flowFmt int
if isMagnify {
flowFmt = 1
} else {
flowFmt = util.MB
}
if qosEnableStr := r.FormValue(QosEnableKey); qosEnableStr != "" {
qosParam.qosEnable, _ = strconv.ParseBool(qosEnableStr)
}
if isEnableIops {
if iopsRLimitStr := r.FormValue(IopsRKey); iopsRLimitStr != "" {
log.LogInfof("actin[parseRequestQos] iopsRLimitStr %v", iopsRLimitStr)
if value, err = strconv.Atoi(iopsRLimitStr); err == nil {
qosParam.iopsRVal = uint64(value)
if !isMagnify && qosParam.iopsRVal < MinIoLimit {
err = fmt.Errorf("iops read %v need larger than 100", value)
return
}
if isMagnify && (qosParam.iopsRVal < MinMagnify || qosParam.iopsRVal > MaxMagnify) {
err = fmt.Errorf("iops read magnify %v must between %v and %v", value, MinMagnify, MaxMagnify)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
}
}
if iopsWLimitStr := r.FormValue(IopsWKey); iopsWLimitStr != "" {
log.LogInfof("actin[parseRequestQos] iopsWLimitStr %v", iopsWLimitStr)
if value, err = strconv.Atoi(iopsWLimitStr); err == nil {
qosParam.iopsWVal = uint64(value)
if !isMagnify && qosParam.iopsWVal < MinIoLimit {
err = fmt.Errorf("iops %v write write io larger than 100", value)
return
}
if isMagnify && (qosParam.iopsWVal < MinMagnify || qosParam.iopsWVal > MaxMagnify) {
err = fmt.Errorf("iops write magnify %v must between %v and %v", value, MinMagnify, MaxMagnify)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
}
}
}
if flowRLimitStr := r.FormValue(FlowRKey); flowRLimitStr != "" {
log.LogInfof("actin[parseRequestQos] flowRLimitStr %v", flowRLimitStr)
if value, err = strconv.Atoi(flowRLimitStr); err == nil {
qosParam.flowRVal = uint64(value * flowFmt)
if !isMagnify && (qosParam.flowRVal < MinFlowLimit || qosParam.flowRVal > MaxFlowLimit) {
err = fmt.Errorf("flow read %v should be between 100M and 10TB ", value)
return
}
if isMagnify && (qosParam.flowRVal < MinMagnify || qosParam.flowRVal > MaxMagnify) {
err = fmt.Errorf("flow read magnify %v must between %v and %v", value, MinMagnify, MaxMagnify)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
}
}
if flowWLimitStr := r.FormValue(FlowWKey); flowWLimitStr != "" {
log.LogInfof("actin[parseRequestQos] flowWLimitStr %v", flowWLimitStr)
if value, err = strconv.Atoi(flowWLimitStr); err == nil {
qosParam.flowWVal = uint64(value * flowFmt)
if !isMagnify && (qosParam.flowWVal < MinFlowLimit || qosParam.flowWVal > MaxFlowLimit) {
err = fmt.Errorf("flow write %v should be between 100M and 10TB", value)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
if isMagnify && (qosParam.flowWVal < MinMagnify || qosParam.flowWVal > MaxMagnify) {
err = fmt.Errorf("flow write magnify %v must between %v and %v", value, MinMagnify, MaxMagnify)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
}
}
log.LogInfof("action[parseRequestQos] result %v", qosParam)
return
}
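// Unit note for parseRequestQos above: with isMagnify == false, flow values
// are multiplied by util.MB, so a flow-write form value of 120 is stored as
// 120*util.MB in qosParam.flowWVal; with isMagnify == true the raw value is
// kept and checked against the magnify bounds instead.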
// flowRVal, flowWVal take MB as unit
func (m *Server) QosUpdateZoneLimit(w http.ResponseWriter, r *http.Request) {
var (
value interface{}
ok bool
err error
qosParam *qosArgs
enable bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpdateZoneLimit))
defer func() {
doStatAndMetric(proto.QosUpdateZoneLimit, metric, err, nil)
}()
var zoneName string
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
if qosParam, err = parseRequestQos(r, false, true); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if sVal := r.FormValue(DiskEnableKey); sVal != "" {
if enable, err = strconv.ParseBool(sVal); err == nil {
log.LogInfof("action[DiskQosUpdate] enable be set [%v]", enable)
m.cluster.diskQosEnable = enable
err = m.cluster.syncPutCluster()
}
}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("zonename [%v] not found", zoneName)))
return
}
zone := value.(*Zone)
zone.updateDataNodeQosLimit(m.cluster, qosParam)
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
// flowRVal, flowWVal take MB as unit
func (m *Server) QosGetZoneLimit(w http.ResponseWriter, r *http.Request) {
var (
value interface{}
ok bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosGetZoneLimitInfo))
defer func() {
doStatAndMetric(proto.QosGetZoneLimitInfo, metric, nil, nil)
}()
var zoneName string
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("zonename [%v] not found", zoneName)))
return
}
zone := value.(*Zone)
type qosZoneStatus struct {
Zone string
DiskLimitEnable bool
IopsRVal uint64
IopsWVal uint64
FlowRVal uint64
FlowWVal uint64
}
zoneSt := &qosZoneStatus{
Zone: zoneName,
DiskLimitEnable: m.cluster.diskQosEnable,
IopsRVal: zone.QosIopsRLimit,
IopsWVal: zone.QosIopsWLimit,
FlowRVal: zone.QosFlowRLimit,
FlowWVal: zone.QosFlowWLimit,
}
sendOkReply(w, r, newSuccessHTTPReply(zoneSt))
}
func (m *Server) QosUpdate(w http.ResponseWriter, r *http.Request) {
var (
volName string
err error
vol *Vol
enable bool
value string
limitArgs *qosArgs
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpdate))
defer func() {
doStatAndMetric(proto.QosUpdate, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = extractName(r); err == nil {
if vol, err = m.cluster.getVol(volName); err != nil {
goto RET
}
if value = r.FormValue(QosEnableKey); value != "" {
if enable, err = strconv.ParseBool(value); err != nil {
goto RET
}
if err = vol.volQosEnable(m.cluster, enable); err != nil {
goto RET
}
log.LogInfof("action[DiskQosUpdate] update qos eanble [%v]", enable)
}
if limitArgs, err = parseRequestQos(r, false, false); err == nil && limitArgs.isArgsWork() {
if err = vol.volQosUpdateLimit(m.cluster, limitArgs); err != nil {
goto RET
}
log.LogInfof("action[DiskQosUpdate] update qos limit [%v] [%v] [%v] [%v] [%v]", enable,
limitArgs.iopsRVal, limitArgs.iopsWVal, limitArgs.flowRVal, limitArgs.flowWVal)
}
}
RET:
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = sendOkReply(w, r, newSuccessHTTPReply("success"))
return
}
func (m *Server) createDataPartition(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
volName string
vol *Vol
reqCreateCount int
lastTotalDataPartitions int
clusterTotalDataPartitions int
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminCreateDataPartition))
defer func() {
doStatAndMetric(proto.AdminCreateDataPartition, metric, err, map[string]string{exporter.Vol: volName})
}()
if reqCreateCount, volName, err = parseRequestToCreateDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if proto.IsCold(vol.VolType) {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("low frequency vol can't create dp")))
return
}
lastTotalDataPartitions = len(vol.dataPartitions.partitions)
clusterTotalDataPartitions = m.cluster.getDataPartitionCount()
err = m.cluster.batchCreateDataPartition(vol, reqCreateCount, false)
rstMsg = fmt.Sprintf(" createDataPartition succeeeds. "+
"clusterLastTotalDataPartitions[%v],vol[%v] has %v data partitions previously and %v data partitions now",
clusterTotalDataPartitions, volName, lastTotalDataPartitions, len(vol.dataPartitions.partitions))
if err != nil {
log.LogErrorf("create data partition fail: volume(%v) err(%v)", volName, err)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) changeDataPartitionLeader(w http.ResponseWriter, r *http.Request) {
var (
dp *DataPartition
partitionID uint64
err error
host string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDataPartitionChangeLeader))
defer func() {
doStatAndMetric(proto.AdminDataPartitionChangeLeader, metric, err, nil)
}()
if partitionID, _, err = parseRequestToGetDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
if host = r.FormValue(addrKey); host == "" {
err = keyNotFound(addrKey)
return
}
if !checkIp(host) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
if err = dp.tryToChangeLeaderByHost(host); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
rstMsg := fmt.Sprintf(" changeDataPartitionLeader command success send to dest host but need check. ")
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
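// getDataPartition returns the detail of a data partition, looked up through the volume when a volume name is supplied, otherwise directly by partition ID.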
func (m *Server) getDataPartition(w http.ResponseWriter, r *http.Request) {
var (
dp *DataPartition
partitionID uint64
volName string
vol *Vol
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetDataPartition))
defer func() {
doStatAndMetric(proto.AdminGetDataPartition, metric, err, map[string]string{exporter.Vol: volName})
}()
if partitionID, volName, err = parseRequestToGetDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if volName != "" {
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
if dp, err = vol.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
} else {
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
}
sendOkReply(w, r, newSuccessHTTPReply(dp.buildDpInfo(m.cluster)))
}
// Load the data partition.
func (m *Server) loadDataPartition(w http.ResponseWriter, r *http.Request) {
var (
msg string
dp *DataPartition
partitionID uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminLoadDataPartition))
defer func() {
doStatAndMetric(proto.AdminLoadDataPartition, metric, err, nil)
}()
if partitionID, err = parseRequestToLoadDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
m.cluster.loadDataPartition(dp)
msg = fmt.Sprintf(proto.AdminLoadDataPartition+" partitionID :%v load data partition successfully", partitionID)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
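// addDataReplica adds a replica of the data partition on the given address, then marks the partition read-only and recovering so the repair can be tracked through the bad-partition list.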
func (m *Server) addDataReplica(w http.ResponseWriter, r *http.Request) {
var (
msg string
addr string
dp *DataPartition
partitionID uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminAddDataReplica))
defer func() {
doStatAndMetric(proto.AdminAddDataReplica, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToAddDataReplica(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
if err = m.cluster.addDataReplica(dp, addr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
dp.Status = proto.ReadOnly
dp.isRecover = true
m.cluster.putBadDataPartitionIDs(nil, addr, dp.PartitionID)
msg = fmt.Sprintf("data partitionID :%v add replica [%v] successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
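// deleteDataReplica removes a replica of the data partition from the given address; force and raftForce are only meant for a two-replica partition that has lost its leader. The volume's dp replica number is re-evaluated afterwards.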
func (m *Server) deleteDataReplica(w http.ResponseWriter, r *http.Request) {
var (
msg string
addr string
dp *DataPartition
partitionID uint64
err error
force bool // currently only used for a two-replica partition with no leader
raftForce bool
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDeleteDataReplica))
defer func() {
doStatAndMetric(proto.AdminDeleteDataReplica, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToRemoveDataReplica(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
// force is only used when a data partition of a two-replica volume has no leader because one replica crashed
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
force, err = pareseBoolWithDefault(r, forceKey, false)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.removeDataReplica(dp, addr, !force, raftForce); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if vol, err = m.cluster.getVol(dp.VolName); err != nil {
log.LogErrorf("action[updateVol] err[%v]", err)
err = proto.ErrVolNotExists
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = vol.tryUpdateDpReplicaNum(m.cluster, dp)
msg = fmt.Sprintf("data partitionID :%v delete replica [%v] successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
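// addMetaReplica adds a replica of the meta partition on the given address and marks the partition as recovering.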
func (m *Server) addMetaReplica(w http.ResponseWriter, r *http.Request) {
var (
msg string
addr string
mp *MetaPartition
partitionID uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminAddMetaReplica))
defer func() {
doStatAndMetric(proto.AdminAddMetaReplica, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToAddMetaReplica(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
if err = m.cluster.addMetaReplica(mp, addr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
mp.IsRecover = true
m.cluster.putBadMetaPartitions(addr, mp.PartitionID)
msg = fmt.Sprintf("meta partitionID :%v add replica [%v] successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
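// deleteMetaReplica removes a replica of the meta partition from the given address; the optional force flag is forwarded to the cluster-level delete.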
func (m *Server) deleteMetaReplica(w http.ResponseWriter, r *http.Request) {
var (
msg string
addr string
mp *MetaPartition
partitionID uint64
err error
force bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDeleteMetaReplica))
defer func() {
doStatAndMetric(proto.AdminDeleteMetaReplica, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToRemoveMetaReplica(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
var value string
if value = r.FormValue(forceKey); value != "" {
force, _ = strconv.ParseBool(value)
}
if err = m.cluster.deleteMetaReplica(mp, addr, true, force); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("meta partitionID :%v delete replica [%v] successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
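// changeMetaPartitionLeader tries to transfer the raft leader of a meta partition to the host given by the addr parameter; the result needs to be checked separately.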
func (m *Server) changeMetaPartitionLeader(w http.ResponseWriter, r *http.Request) {
var (
mp *MetaPartition
partitionID uint64
err error
host string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminChangeMetaPartitionLeader))
defer func() {
doStatAndMetric(proto.AdminChangeMetaPartitionLeader, metric, err, nil)
}()
if partitionID, _, err = parseRequestToGetDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
log.LogErrorf("changeMetaPartitionLeader.err %v", err)
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
log.LogErrorf("changeMetaPartitionLeader.err %v", proto.ErrMetaPartitionNotExists)
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
if host = r.FormValue(addrKey); host == "" {
err = keyNotFound(addrKey)
log.LogErrorf("changeMetaPartitionLeader.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !checkIp(host) {
log.LogErrorf("changeMetaPartitionLeader.err addr not legal")
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
if err = mp.tryToChangeLeaderByHost(host); err != nil {
log.LogErrorf("changeMetaPartitionLeader.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
rstMsg := fmt.Sprintf(" changeMetaPartitionLeader command success send to dest host but need check. ")
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// balanceMetaPartitionLeader balances the leaders of meta partitions across metaNodes; the scope can be the whole cluster, selected zones, or selected nodeSets.
func (m *Server) balanceMetaPartitionLeader(w http.ResponseWriter, r *http.Request) {
var (
zonesKey string
nodesetIdKey string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminBalanceMetaPartitionLeader))
defer func() {
doStatAndMetric(proto.AdminBalanceMetaPartitionLeader, metric, err, nil)
}()
if zonesKey, nodesetIdKey, err = parseRequestToBalanceMetaPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
log.LogErrorf("balanceMetaPartitionLeader.err %v", err)
return
}
log.LogInfof("zone:%v,nodesetId:%v", zonesKey, nodesetIdKey)
zonesM := make(map[string]struct{})
if zonesKey != "" {
zones := strings.Split(zonesKey, commaSplit)
for _, zone := range zones {
zonesM[zone] = struct{}{}
}
}
nodesetIdM := make(map[uint64]struct{})
if nodesetIdKey != "" {
nodesetIds := strings.Split(nodesetIdKey, commaSplit)
for _, nodeSetId := range nodesetIds {
id, err := strconv.ParseUint(nodeSetId, 10, 64)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
log.LogErrorf("balanceMetaPartitionLeader.err %v", err)
return
}
nodesetIdM[id] = struct{}{}
}
}
log.LogInfof("balanceMetaPartitionLeader zones[%v] length[%d], nodesetIds[%v] length[%d]", zonesKey, len(zonesM), nodesetIdKey, len(nodesetIdM))
err = m.cluster.balanceMetaPartitionLeader(zonesM, nodesetIdM)
if err != nil {
log.LogErrorf("balanceMetaPartitionLeader.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
rstMsg := fmt.Sprintf("balanceMetaPartitionLeader command sucess")
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// Decommission a data partition. This usually happens when disk error has been reported.
// This function needs to be called manually by the admin.
func (m *Server) decommissionDataPartition(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
dp *DataPartition
addr string
partitionID uint64
raftForce bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDecommissionDataPartition))
defer func() {
doStatAndMetric(proto.AdminDecommissionDataPartition, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToDecommissionDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !dp.IsDecommissionInitial() {
rstMsg = fmt.Sprintf(" dataPartitionID :%v status %v not support decommission",
partitionID, dp.GetDecommissionStatus())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: rstMsg})
return
}
if dp.isSpecialReplicaCnt() {
rstMsg = fmt.Sprintf(proto.AdminDecommissionDataPartition+" dataPartitionID :%v has special replica cnt %v; decommission on node:%v runs asynchronously, check later",
partitionID, dp.ReplicaNum, addr)
go m.cluster.decommissionDataPartition(addr, dp, raftForce, handleDataPartitionOfflineErr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
return
}
if err = m.cluster.decommissionDataPartition(addr, dp, raftForce, handleDataPartitionOfflineErr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if !dp.isSpecialReplicaCnt() {
rstMsg = fmt.Sprintf(proto.AdminDecommissionDataPartition+" dataPartitionID :%v on node:%v successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
}
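// diagnoseDataPartition reports inactive data nodes as well as data partitions that are corrupt, lack replicas, have bad or excess replicas, or whose replicas differ in file count or used size.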
func (m *Server) diagnoseDataPartition(w http.ResponseWriter, r *http.Request) {
var (
err error
rstMsg *proto.DataPartitionDiagnosis
inactiveNodes []string
corruptDps []*DataPartition
lackReplicaDps []*DataPartition
badReplicaDps []*DataPartition
repFileCountDifferDps []*DataPartition
repUsedSizeDifferDps []*DataPartition
excessReplicaDPs []*DataPartition
corruptDpIDs []uint64
lackReplicaDpIDs []uint64
badReplicaDpIDs []uint64
repFileCountDifferDpIDs []uint64
repUsedSizeDifferDpIDs []uint64
excessReplicaDpIDs []uint64
badDataPartitionInfos []proto.BadPartitionRepairView
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDiagnoseDataPartition))
defer func() {
doStatAndMetric(proto.AdminDiagnoseDataPartition, metric, err, nil)
}()
ignoreDiscardDp, err := pareseBoolWithDefault(r, ignoreDiscardKey, false)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
corruptDpIDs = make([]uint64, 0)
lackReplicaDpIDs = make([]uint64, 0)
badReplicaDpIDs = make([]uint64, 0)
repFileCountDifferDpIDs = make([]uint64, 0)
repUsedSizeDifferDpIDs = make([]uint64, 0)
excessReplicaDpIDs = make([]uint64, 0)
if inactiveNodes, err = m.cluster.checkInactiveDataNodes(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if lackReplicaDps, badReplicaDps, repFileCountDifferDps, repUsedSizeDifferDps, excessReplicaDPs, corruptDps, err = m.cluster.checkReplicaOfDataPartitions(ignoreDiscardDp); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
for _, dp := range corruptDps {
corruptDpIDs = append(corruptDpIDs, dp.PartitionID)
}
for _, dp := range lackReplicaDps {
lackReplicaDpIDs = append(lackReplicaDpIDs, dp.PartitionID)
}
for _, dp := range badReplicaDps {
badReplicaDpIDs = append(badReplicaDpIDs, dp.PartitionID)
}
for _, dp := range repFileCountDifferDps {
repFileCountDifferDpIDs = append(repFileCountDifferDpIDs, dp.PartitionID)
}
for _, dp := range repUsedSizeDifferDps {
repUsedSizeDifferDpIDs = append(repUsedSizeDifferDpIDs, dp.PartitionID)
}
for _, dp := range excessReplicaDPs {
excessReplicaDpIDs = append(excessReplicaDpIDs, dp.PartitionID)
}
// badDataPartitions = m.cluster.getBadDataPartitionsView()
badDataPartitionInfos = m.cluster.getBadDataPartitionsRepairView()
rstMsg = &proto.DataPartitionDiagnosis{
InactiveDataNodes: inactiveNodes,
CorruptDataPartitionIDs: corruptDpIDs,
LackReplicaDataPartitionIDs: lackReplicaDpIDs,
BadDataPartitionInfos: badDataPartitionInfos,
BadReplicaDataPartitionIDs: badReplicaDpIDs,
RepFileCountDifferDpIDs: repFileCountDifferDpIDs,
RepUsedSizeDifferDpIDs: repUsedSizeDifferDpIDs,
ExcessReplicaDpIDs: excessReplicaDpIDs,
}
log.LogInfof("diagnose dataPartition[%v] inactiveNodes:[%v], corruptDpIDs:[%v], "+
"lackReplicaDpIDs:[%v], BadReplicaDataPartitionIDs[%v], "+
"repFileCountDifferDpIDs:[%v], RepUsedSizeDifferDpIDs[%v], excessReplicaDpIDs[%v]",
m.cluster.Name, inactiveNodes, corruptDpIDs,
lackReplicaDpIDs, badReplicaDpIDs,
repFileCountDifferDpIDs, repUsedSizeDifferDpIDs, excessReplicaDpIDs)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
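// resetDataPartitionDecommissionStatus clears the decommission state of a data partition.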
func (m *Server) resetDataPartitionDecommissionStatus(w http.ResponseWriter, r *http.Request) {
var (
msg string
dp *DataPartition
partitionID uint64
err error
)
if partitionID, err = parseRequestToLoadDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
dp.ResetDecommissionStatus()
msg = fmt.Sprintf("partitionID :%v reset decommission status successfully", partitionID)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
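// queryDataPartitionDecommissionStatus returns a human-readable summary of a data partition's decommission progress, including source/destination nodes and current replicas.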
func (m *Server) queryDataPartitionDecommissionStatus(w http.ResponseWriter, r *http.Request) {
var (
msg string
dp *DataPartition
partitionID uint64
err error
replicas []string
)
if partitionID, err = parseRequestToLoadDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
for _, replica := range dp.Replicas {
replicas = append(replicas, replica.Addr)
}
msg = fmt.Sprintf("partitionID:%v status[%v] specialStep[%v] retry [%v] raftForce[%v] recover [%v] "+
"decommission src dataNode[%v] disk[%v] dst dataNode[%v] term[%v] replicas[%v] DecommissionWaitTimes[%v]",
partitionID, dp.GetDecommissionStatus(), dp.GetSpecialReplicaDecommissionStep(), dp.DecommissionRetry, dp.DecommissionRaftForce, dp.isRecover,
dp.DecommissionSrcAddr, dp.DecommissionSrcDiskPath, dp.DecommissionDstAddr, dp.DecommissionTerm, replicas, dp.DecommissionWaitTimes)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
// Mark the volume as deleted, which will then be deleted later.
func (m *Server) markDeleteVol(w http.ResponseWriter, r *http.Request) {
var (
name string
authKey string
// force bool
err error
msg string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDeleteVol))
defer func() {
doStatAndMetric(proto.AdminDeleteVol, metric, err, map[string]string{exporter.Vol: name})
}()
if name, authKey, _, err = parseRequestToDeleteVol(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.markDeleteVol(name, authKey, false); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.user.deleteVolPolicy(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("delete vol[%v] successfully,from[%v]", name, r.RemoteAddr)
log.LogWarn(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
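// checkReplicaNum validates a replicaNum change in an update-volume request: the value may only be reduced by one at a time, only hot volumes may change it, and followerRead must be consistent with the new value.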
func (m *Server) checkReplicaNum(r *http.Request, vol *Vol, req *updateVolReq) (err error) {
var (
replicaNumInt64 int64
replicaNum int
)
if replicaNumStr := r.FormValue(replicaNumKey); replicaNumStr != "" {
if replicaNumInt64, err = strconv.ParseInt(replicaNumStr, 10, 8); err != nil {
err = unmatchedKey(replicaNumKey)
return
}
replicaNum = int(replicaNumInt64)
} else {
replicaNum = int(vol.dpReplicaNum)
}
req.replicaNum = replicaNum
if replicaNum != 0 && replicaNum != int(vol.dpReplicaNum) {
if replicaNum != int(vol.dpReplicaNum)-1 {
err = fmt.Errorf("replicaNum only need be reduced one replica one time")
return
}
if !proto.IsHot(vol.VolType) {
err = fmt.Errorf("vol type(%v) replicaNum cann't be changed", vol.VolType)
return
}
if ok, dpArry := vol.isOkUpdateRepCnt(); !ok {
err = fmt.Errorf("vol have dataPartitions[%v] with inconsistent dataPartitions cnt to volume's ", dpArry)
return
}
}
if proto.IsHot(vol.VolType) {
if req.replicaNum == 0 ||
((req.replicaNum == 1 || req.replicaNum == 2) && !req.followerRead) {
err = fmt.Errorf("replica or follower read status error")
return
}
} else {
if req.replicaNum == 0 && req.coldArgs.cacheCap > 0 {
req.replicaNum = 1
}
if (req.replicaNum == 0 && req.replicaNum != int(vol.dpReplicaNum)) || !req.followerRead {
err = fmt.Errorf("replica or follower read status error")
return
}
}
return
}
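// updateVol validates an update-volume request, merges it into the volume's current settings and persists the result through the cluster.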
func (m *Server) updateVol(w http.ResponseWriter, r *http.Request) {
var (
req = &updateVolReq{}
vol *Vol
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateVol))
defer func() {
doStatAndMetric(proto.AdminUpdateVol, metric, err, map[string]string{exporter.Vol: req.name})
}()
if req.name, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(req.name); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if err = parseVolUpdateReq(r, vol, req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if req.followerRead, req.authenticate, err = parseBoolFieldToUpdateVol(r, vol); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.checkReplicaNum(r, vol, req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
newArgs := getVolVarargs(vol)
newArgs.zoneName = req.zoneName
newArgs.description = req.description
newArgs.capacity = req.capacity
newArgs.deleteLockTime = req.deleteLockTime
newArgs.followerRead = req.followerRead
newArgs.authenticate = req.authenticate
newArgs.dpSelectorName = req.dpSelectorName
newArgs.dpSelectorParm = req.dpSelectorParm
newArgs.enablePosixAcl = req.enablePosixAcl
newArgs.enableTransaction = req.enableTransaction
newArgs.txTimeout = req.txTimeout
newArgs.txConflictRetryNum = req.txConflictRetryNum
newArgs.txConflictRetryInterval = req.txConflictRetryInterval
newArgs.txOpLimit = req.txOpLimit
newArgs.enableQuota = req.enableQuota
if req.coldArgs != nil {
newArgs.coldArgs = req.coldArgs
}
newArgs.dpReplicaNum = uint8(req.replicaNum)
newArgs.dpReadOnlyWhenVolFull = req.dpReadOnlyWhenVolFull
log.LogWarnf("[updateVolOut] name [%s], z1 [%s], z2[%s] replicaNum[%v]", req.name, req.zoneName, vol.Name, req.replicaNum)
if err = m.cluster.updateVol(req.name, req.authKey, newArgs); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
var response string
if hasTxParams(r) {
response = fmt.Sprintf("update vol[%v] successfully, txTimeout[%v] enableTransaction[%v]",
req.name, newArgs.txTimeout, proto.GetMaskString(newArgs.enableTransaction))
} else {
response = fmt.Sprintf("update vol[%v] successfully", req.name)
}
sendOkReply(w, r, newSuccessHTTPReply(response))
}
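// volExpand increases the capacity of a volume; the new capacity must be larger than the current one.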
func (m *Server) volExpand(w http.ResponseWriter, r *http.Request) {
var (
name string
authKey string
err error
msg string
capacity int
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminVolExpand))
defer func() {
doStatAndMetric(proto.AdminVolExpand, metric, err, map[string]string{exporter.Vol: name})
}()
if name, authKey, capacity, err = parseRequestToSetVolCapacity(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if uint64(capacity) <= vol.Capacity {
err = fmt.Errorf("expand capacity[%v] should be larger than the old capacity[%v]", capacity, vol.Capacity)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
newArgs := getVolVarargs(vol)
newArgs.capacity = uint64(capacity)
if err = m.cluster.updateVol(name, authKey, newArgs); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("update vol[%v] successfully\n", name)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
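// volShrink decreases the capacity of a volume; the new capacity must be smaller than the current one.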
func (m *Server) volShrink(w http.ResponseWriter, r *http.Request) {
var (
name string
authKey string
err error
msg string
capacity int
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminVolShrink))
defer func() {
doStatAndMetric(proto.AdminVolShrink, metric, err, map[string]string{exporter.Vol: name})
}()
if name, authKey, capacity, err = parseRequestToSetVolCapacity(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if uint64(capacity) >= vol.Capacity {
err = fmt.Errorf("shrink capacity[%v] should be less than the old capacity[%v]", capacity, vol.Capacity)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
newArgs := getVolVarargs(vol)
newArgs.capacity = uint64(capacity)
if err = m.cluster.updateVol(name, authKey, newArgs); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("update vol[%v] successfully\n", name)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
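// checkCreateReq validates a create-volume request and fills in defaults; for cold volumes it forces followerRead and completes the EBS block size and cache related arguments.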
func (m *Server) checkCreateReq(req *createVolReq) (err error) {
if !proto.IsHot(req.volType) && !proto.IsCold(req.volType) {
return fmt.Errorf("vol type %d is illegal", req.volType)
}
if req.capacity == 0 {
return fmt.Errorf("vol capacity can't be zero, %d", req.capacity)
}
if req.dpSize != 0 && req.dpSize <= 10 {
return fmt.Errorf("datapartition dpSize must be bigger than 10 G")
}
if req.dpCount > maxInitDataPartitionCnt {
return fmt.Errorf("dpCount[%d] exceeds maximum limit[%d]", req.dpCount, maxInitDataPartitionCnt)
}
if proto.IsHot(req.volType) {
if req.dpReplicaNum == 0 {
req.dpReplicaNum = defaultReplicaNum
}
if req.dpReplicaNum > 3 {
return fmt.Errorf("hot vol's replicaNum should be 1 to 3, received replicaNum is[%v]", req.dpReplicaNum)
}
return nil
} else if proto.IsCold(req.volType) {
if req.dpReplicaNum > 16 {
return fmt.Errorf("cold vol's replicaNum should less then 17, received replicaNum is[%v]", req.dpReplicaNum)
}
}
if req.dpReplicaNum == 0 && req.coldArgs.cacheCap > 0 {
req.dpReplicaNum = 1
}
req.followerRead = true
args := req.coldArgs
if args.objBlockSize == 0 {
args.objBlockSize = defaultEbsBlkSize
}
if err = checkCacheAction(args.cacheAction); err != nil {
return
}
if args.cacheTtl == 0 {
args.cacheTtl = defaultCacheTtl
}
if args.cacheThreshold == 0 {
args.cacheThreshold = defaultCacheThreshold
}
if args.cacheHighWater == 0 {
args.cacheHighWater = defaultCacheHighWater
}
if args.cacheLowWater == 0 {
args.cacheLowWater = defaultCacheLowWater
}
if args.cacheLRUInterval != 0 && args.cacheLRUInterval < 2 {
return fmt.Errorf("cache lruInterval(%d) must bigger than 2 minutes", args.cacheLRUInterval)
}
if args.cacheLRUInterval == 0 {
args.cacheLRUInterval = defaultCacheLruInterval
}
if args.cacheLowWater >= args.cacheHighWater {
return fmt.Errorf("low water(%d) must be less than high water(%d)", args.cacheLowWater, args.cacheHighWater)
}
if args.cacheCap >= uint64(req.capacity) {
return fmt.Errorf("cache capacity(%d) must be less than capacity(%d)", args.cacheCap, req.capacity)
}
if proto.IsCold(req.volType) && req.dpReplicaNum == 0 && args.cacheCap > 0 {
return fmt.Errorf("cache capacity(%d) not zero,replicaNum should not be zero", args.cacheCap)
}
if args.cacheHighWater >= 90 || args.cacheLowWater >= 90 {
return fmt.Errorf("low(%d) or high water(%d) can't be large than 90, low than 0", args.cacheLowWater, args.cacheHighWater)
}
if int(req.dpReplicaNum) > m.cluster.dataNodeCount() {
return fmt.Errorf("dp replicaNum %d can't be large than dataNodeCnt %d", req.dpReplicaNum, m.cluster.dataNodeCount())
}
req.coldArgs = args
return nil
}
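// createVol creates a volume after validating the request and then associates the new volume with its owner user.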
func (m *Server) createVol(w http.ResponseWriter, r *http.Request) {
req := &createVolReq{}
vol := &Vol{}
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminCreateVol))
defer func() {
doStatAndMetric(proto.AdminCreateVol, metric, err, map[string]string{exporter.Vol: req.name})
}()
if err = parseRequestToCreateVol(r, req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.checkCreateReq(req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if proto.IsHot(req.volType) && (req.dpReplicaNum == 1 || req.dpReplicaNum == 2) && !req.followerRead {
err = fmt.Errorf("hot volume replicaNum be 2 and 3,followerRead must set true")
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.createVol(req); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.associateVolWithUser(req.owner, req.name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("create vol[%v] successfully, has allocate [%v] data partitions", req.name, len(vol.dataPartitions.partitions))
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
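// qosUpload handles QoS reports uploaded by clients. When qosEnable is set, the client is registered with the volume's qos manager (an ID is assigned on first contact) and the computed limits are returned.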
func (m *Server) qosUpload(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
vol *Vol
limit *proto.LimitRsp2Client
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpload))
defer func() {
doStatAndMetric(proto.QosUpload, metric, err, map[string]string{exporter.Vol: name})
}()
ctx := context.Background()
m.cluster.QosAcceptLimit.WaitN(ctx, 1)
log.LogInfof("action[qosUpload] limit %v", m.cluster.QosAcceptLimit.Limit())
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
qosEnableStr := r.FormValue(QosEnableKey)
if qosEnableStr == "" {
sendErrReply(w, r, newErrHTTPReply(proto.ErrParamError))
return
}
// qos upload may be called during client init, so the qosEnable param identifies whether the master needs to do the calculation
var clientInfo *proto.ClientReportLimitInfo
if qosEnable, _ := strconv.ParseBool(qosEnableStr); qosEnable {
if clientInfo, err = parseQosInfo(r); err == nil {
log.LogDebugf("action[qosUpload] cliInfoMgrMap [%v],clientInfo id[%v] clientInfo.Host %v, enable %v", clientInfo.ID, clientInfo.Host, r.RemoteAddr, qosEnable)
if clientInfo.ID == 0 {
if limit, err = vol.qosManager.init(m.cluster, clientInfo.Host); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
clientInfo.ID = limit.ID
}
if limit, err = vol.qosManager.HandleClientQosReq(clientInfo, clientInfo.ID); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
} else {
log.LogInfof("action[qosUpload] qosEnableStr:[%v] err [%v]", qosEnableStr, err)
}
}
sendOkReply(w, r, newSuccessHTTPReply(limit))
}
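// getVolSimpleInfo returns the simplified volume view built by newSimpleView.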
func (m *Server) getVolSimpleInfo(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
vol *Vol
)
metric := exporter.NewTPCnt("req" + strings.Replace(proto.AdminGetVol, "/", "_", -1))
defer func() {
doStatAndMetric(proto.AdminGetVol, metric, err, map[string]string{exporter.Vol: name})
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
volView := newSimpleView(vol)
sendOkReply(w, r, newSuccessHTTPReply(volView))
}
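// newSimpleView builds a SimpleVolView snapshot of a volume, aggregating inode and dentry counts from its meta partitions and copying the volume's current configuration.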
func newSimpleView(vol *Vol) (view *proto.SimpleVolView) {
var (
volInodeCount uint64
volDentryCount uint64
)
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
volDentryCount = volDentryCount + mp.DentryCount
volInodeCount = volInodeCount + mp.InodeCount
}
vol.mpsLock.RUnlock()
maxPartitionID := vol.maxPartitionID()
view = &proto.SimpleVolView{
ID: vol.ID,
Name: vol.Name,
Owner: vol.Owner,
ZoneName: vol.zoneName,
DpReplicaNum: vol.dpReplicaNum,
MpReplicaNum: vol.mpReplicaNum,
InodeCount: volInodeCount,
DentryCount: volDentryCount,
MaxMetaPartitionID: maxPartitionID,
Status: vol.Status,
Capacity: vol.Capacity,
FollowerRead: vol.FollowerRead,
EnablePosixAcl: vol.enablePosixAcl,
EnableQuota: vol.enableQuota,
EnableTransaction: proto.GetMaskString(vol.enableTransaction),
TxTimeout: vol.txTimeout,
TxConflictRetryNum: vol.txConflictRetryNum,
TxConflictRetryInterval: vol.txConflictRetryInterval,
TxOpLimit: vol.txOpLimit,
NeedToLowerReplica: vol.NeedToLowerReplica,
Authenticate: vol.authenticate,
CrossZone: vol.crossZone,
DefaultPriority: vol.defaultPriority,
DomainOn: vol.domainOn,
RwDpCnt: vol.dataPartitions.readableAndWritableCnt,
MpCnt: len(vol.MetaPartitions),
DpCnt: len(vol.dataPartitions.partitionMap),
CreateTime: time.Unix(vol.createTime, 0).Format(proto.TimeFormat),
DeleteLockTime: vol.DeleteLockTime,
Description: vol.description,
DpSelectorName: vol.dpSelectorName,
DpSelectorParm: vol.dpSelectorParm,
DpReadOnlyWhenVolFull: vol.DpReadOnlyWhenVolFull,
VolType: vol.VolType,
ObjBlockSize: vol.EbsBlkSize,
CacheCapacity: vol.CacheCapacity,
CacheAction: vol.CacheAction,
CacheThreshold: vol.CacheThreshold,
CacheLruInterval: vol.CacheLRUInterval,
CacheTtl: vol.CacheTTL,
CacheLowWater: vol.CacheLowWater,
CacheHighWater: vol.CacheHighWater,
CacheRule: vol.CacheRule,
PreloadCapacity: vol.getPreloadCapacity(),
LatestVer: vol.VersionMgr.getLatestVer(),
Forbidden: vol.Forbidden,
EnableAuditLog: vol.EnableAuditLog,
}
vol.uidSpaceManager.RLock()
defer vol.uidSpaceManager.RUnlock()
for _, uid := range vol.uidSpaceManager.uidInfo {
view.Uids = append(view.Uids, proto.UidSimpleInfo{
UID: uid.Uid,
Limited: uid.Limited,
})
}
return
}
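// checkIp reports whether addr begins with a dotted-decimal IPv4 address.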
func checkIp(addr string) bool {
ip := strings.Trim(addr, " ")
regStr := `^(([1-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){2}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])`
if match, _ := regexp.MatchString(regStr, ip); match {
return true
}
return false
}
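// checkIpPort reports whether addr looks like an IPv4 address followed by a port between 1024 and 65535.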
func checkIpPort(addr string) bool {
var arr []string
if arr = strings.Split(addr, ":"); len(arr) < 2 {
return false
}
if id, err := strconv.ParseUint(arr[1], 10, 64); err != nil || id > 65535 || id < 1024 {
return false
}
ip := strings.Trim(addr, " ")
regStr := `^(([1-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){2}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])`
if match, _ := regexp.MatchString(regStr, ip); match {
return true
}
return false
}
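// addDataNode registers a new data node with the cluster, optionally placing it into the nodeset given by the id parameter.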
func (m *Server) addDataNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
zoneName string
id uint64
err error
nodesetId uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AddDataNode))
defer func() {
doStatAndMetric(proto.AddDataNode, metric, err, nil)
}()
if nodeAddr, zoneName, err = parseRequestForAddNode(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !checkIpPort(nodeAddr) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
var value string
if value = r.FormValue(idKey); value == "" {
nodesetId = 0
} else {
if nodesetId, err = strconv.ParseUint(value, 10, 64); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
}
if id, err = m.cluster.addDataNode(nodeAddr, zoneName, nodesetId); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
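// getDataNode returns the detail of a data node, including the data partitions persisted on it.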
func (m *Server) getDataNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
dataNode *DataNode
dataNodeInfo *proto.DataNodeInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetDataNode))
defer func() {
doStatAndMetric(proto.GetDataNode, metric, err, nil)
}()
if nodeAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dataNode, err = m.cluster.dataNode(nodeAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
log.LogDebugf("getDataNode. addr %v Total %v used %v", nodeAddr, dataNode.Total, dataNode.Used)
dataNode.PersistenceDataPartitions = m.cluster.getAllDataPartitionIDByDatanode(nodeAddr)
// some dps may have been removed from this node even though their decommission failed
dataNodeInfo = &proto.DataNodeInfo{
Total: dataNode.Total,
Used: dataNode.Used,
AvailableSpace: dataNode.AvailableSpace,
ID: dataNode.ID,
ZoneName: dataNode.ZoneName,
Addr: dataNode.Addr,
DomainAddr: dataNode.DomainAddr,
ReportTime: dataNode.ReportTime,
IsActive: dataNode.isActive,
IsWriteAble: dataNode.isWriteAble(),
UsageRatio: dataNode.UsageRatio,
SelectedTimes: dataNode.SelectedTimes,
DataPartitionReports: dataNode.DataPartitionReports,
DataPartitionCount: dataNode.DataPartitionCount,
NodeSetID: dataNode.NodeSetID,
PersistenceDataPartitions: dataNode.PersistenceDataPartitions,
BadDisks: dataNode.BadDisks,
RdOnly: dataNode.RdOnly,
MaxDpCntLimit: dataNode.GetDpCntLimit(),
CpuUtil: dataNode.CpuUtil.Load(),
IoUtils: dataNode.GetIoUtils(),
}
sendOkReply(w, r, newSuccessHTTPReply(dataNodeInfo))
}
// Decommission a data node. This will decommission all the data partition on that node.
func (m *Server) decommissionDataNode(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
offLineAddr string
raftForce bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.DecommissionDataNode))
defer func() {
doStatAndMetric(proto.DecommissionDataNode, metric, err, nil)
}()
if offLineAddr, err = parseDecomDataNodeReq(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.dataNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
if err = m.cluster.migrateDataNode(offLineAddr, "", raftForce, 0); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("decommission data node [%v] submited!need check status later!", offLineAddr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
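// migrateDataNodeHandler migrates data partitions from a source data node to a writable target node in the same nodeset; the migration is submitted and runs asynchronously.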
func (m *Server) migrateDataNodeHandler(w http.ResponseWriter, r *http.Request) {
var (
srcAddr, targetAddr string
limit int
raftForce bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.MigrateDataNode))
defer func() {
doStatAndMetric(proto.MigrateDataNode, metric, err, nil)
}()
srcAddr, targetAddr, limit, err = parseMigrateNodeParam(r)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
srcNode, err := m.cluster.dataNode(srcAddr)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeDataNodeNotExists, Msg: err.Error()})
return
}
targetNode, err := m.cluster.dataNode(targetAddr)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeDataNodeNotExists, Msg: err.Error()})
return
}
if srcNode.NodeSetID != targetNode.NodeSetID {
err = fmt.Errorf("src %s and target %s must exist in the same nodeSet when migrate", srcAddr, targetAddr)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if !targetNode.isWriteAble() || !targetNode.dpCntInLimit() {
err = fmt.Errorf("[%s] is not writable, can't used as target addr for migrate", targetAddr)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.migrateDataNode(srcAddr, targetAddr, raftForce, limit); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg := fmt.Sprintf("migrateDataNodeHandler from src [%v] to target[%v] has submited and run in asyn ways,need check laster!", srcAddr, targetAddr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// Cancel the in-progress decommission of a data node.
func (m *Server) cancelDecommissionDataNode(w http.ResponseWriter, r *http.Request) {
var (
node *DataNode
rstMsg string
offLineAddr string
err error
dps []uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.CancelDecommissionDataNode))
defer func() {
doStatAndMetric(proto.CancelDecommissionDataNode, metric, err, nil)
}()
if offLineAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if node, err = m.cluster.dataNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
if err, dps = m.cluster.decommissionDataNodeCancel(node); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("cancel decommission data node [%v] with paused failed[%v]", offLineAddr, dps)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
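// setNodeInfoHandler updates cluster-wide tunables such as delete batch count, load factor, delete and repair rate limits, the max dp count limit, the cluster create time, and node/nodeset selectors.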
func (m *Server) setNodeInfoHandler(w http.ResponseWriter, r *http.Request) {
var (
params map[string]interface{}
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetNodeInfo))
defer func() {
doStatAndMetric(proto.AdminSetNodeInfo, metric, err, nil)
}()
if params, err = parseAndExtractSetNodeInfoParams(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if batchCount, ok := params[nodeDeleteBatchCountKey]; ok {
if bc, ok := batchCount.(uint64); ok {
if err = m.cluster.setMetaNodeDeleteBatchCount(bc); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[clusterLoadFactorKey]; ok {
if factor, ok := val.(float32); ok {
if err = m.cluster.setClusterLoadFactor(factor); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeMarkDeleteRateKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setDataNodeDeleteLimitRate(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeAutoRepairRateKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setDataNodeAutoRepairLimitRate(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeDpRepairTimeOutKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setDataPartitionRepairTimeOut(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeDpMaxRepairErrCntKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setDataPartitionMaxRepairErrCnt(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeDeleteWorkerSleepMs]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setMetaNodeDeleteWorkerSleepMs(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[maxDpCntLimitKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setMaxDpCntLimit(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[clusterCreateTimeKey]; ok {
if createTimeParam, ok := val.(string); ok {
var createTime time.Time
var err error
if createTime, err = time.ParseInLocation(proto.TimeFormat, createTimeParam, time.Local); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.setClusterCreateTime(createTime.Unix()); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
dataNodesetSelector := extractDataNodesetSelector(r)
metaNodesetSelector := extractMetaNodesetSelector(r)
dataNodeSelector := extractDataNodeSelector(r)
metaNodeSelector := extractMetaNodeSelector(r)
if err = m.updateClusterSelector(dataNodesetSelector, metaNodesetSelector, dataNodeSelector, metaNodeSelector); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set nodeinfo params %v successfully", params)))
}
func (m *Server) updateDataUseRatio(ratio float64) (err error) {
m.cluster.domainManager.dataRatioLimit = ratio
err = m.cluster.putZoneDomain(false)
return
}
func (m *Server) updateExcludeZoneUseRatio(ratio float64) (err error) {
m.cluster.domainManager.excludeZoneUseRatio = ratio
err = m.cluster.putZoneDomain(false)
return
}
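// updateNodesetId moves a data node or meta node into the destination nodeset of the same zone, locking both nodesets in ID order and persisting the change.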
func (m *Server) updateNodesetId(zoneName string, destNodesetId uint64, nodeType uint64, addr string) (err error) {
var (
nsId uint64
dstNs *nodeSet
srcNs *nodeSet
ok bool
value interface{}
metaNode *MetaNode
dataNode *DataNode
nodeTypeUint32 uint32
)
defer func() {
log.LogInfof("action[updateNodesetId] step out")
}()
log.LogWarnf("action[updateNodesetId] zonename[%v] destNodesetId[%v] nodeType[%v] addr[%v]",
zoneName, destNodesetId, nodeType, addr)
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
return fmt.Errorf("zonename [%v] not found", zoneName)
}
zone := value.(*Zone)
if dstNs, ok = zone.nodeSetMap[destNodesetId]; !ok {
return fmt.Errorf("%v destNodesetId not found", destNodesetId)
}
if nodeType == uint64(TypeDataPartition) {
value, ok = zone.dataNodes.Load(addr)
if !ok {
return fmt.Errorf("addr %v not found", addr)
}
nsId = value.(*DataNode).NodeSetID
} else if nodeType == uint64(TypeMetaPartition) {
value, ok = zone.metaNodes.Load(addr)
if !ok {
return fmt.Errorf("addr %v not found", addr)
}
nsId = value.(*MetaNode).NodeSetID
} else {
return fmt.Errorf("%v wrong type", nodeType)
}
log.LogInfof("action[updateNodesetId] zonename[%v] destNodesetId[%v] nodeType[%v] addr[%v] get destid[%v]",
zoneName, destNodesetId, nodeType, addr, dstNs.ID)
srcNs = zone.nodeSetMap[nsId]
if srcNs.ID == dstNs.ID {
return fmt.Errorf("addr belong to same nodeset")
} else if srcNs.ID < dstNs.ID {
// acquire the locks in nodeset ID order to avoid deadlock with concurrent updateNodesetId calls
srcNs.Lock()
dstNs.Lock()
defer srcNs.Unlock()
defer dstNs.Unlock()
} else {
// acquire the locks in nodeset ID order to avoid deadlock with concurrent updateNodesetId calls
dstNs.Lock()
srcNs.Lock()
defer dstNs.Unlock()
defer srcNs.Unlock()
}
// the nodeset capacity is not enlarged when a node is added; it can be adjusted via
// AdminUpdateNodeSetCapcity
if nodeType <= math.MaxUint32 {
nodeTypeUint32 = uint32(nodeType)
} else {
nodeTypeUint32 = math.MaxUint32
}
if nodeTypeUint32 == TypeDataPartition {
if value, ok = srcNs.dataNodes.Load(addr); !ok {
return fmt.Errorf("addr not found in srcNs.dataNodes")
}
dataNode = value.(*DataNode)
dataNode.NodeSetID = dstNs.ID
dstNs.putDataNode(dataNode)
srcNs.deleteDataNode(dataNode)
if err = m.cluster.syncUpdateDataNode(dataNode); err != nil {
dataNode.NodeSetID = srcNs.ID
return
}
} else {
if value, ok = srcNs.metaNodes.Load(addr); !ok {
return fmt.Errorf("ddr not found in srcNs.metaNodes")
}
metaNode = value.(*MetaNode)
metaNode.NodeSetID = dstNs.ID
dstNs.putMetaNode(metaNode)
srcNs.deleteMetaNode(metaNode)
if err = m.cluster.syncUpdateMetaNode(metaNode); err != nil {
metaNode.NodeSetID = srcNs.ID
return
}
}
if err = m.cluster.syncUpdateNodeSet(dstNs); err != nil {
return fmt.Errorf("warn:syncUpdateNodeSet dst srcNs [%v] failed", dstNs.ID)
}
if err = m.cluster.syncUpdateNodeSet(srcNs); err != nil {
return fmt.Errorf("warn:syncUpdateNodeSet src srcNs [%v] failed", srcNs.ID)
}
return
}
func (m *Server) updateZoneNodeSelector(zoneName string, dataNodeSelector string, metaNodeSelector string) (err error) {
var ok bool
var value interface{}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
err = fmt.Errorf("zonename [%v] not found", zoneName)
return
}
zone := value.(*Zone)
zone.nsLock.RLock()
defer zone.nsLock.RUnlock()
for _, ns := range zone.nodeSetMap {
needSync := false
if dataNodeSelector != "" && dataNodeSelector != ns.GetDataNodeSelector() {
ns.SetDataNodeSelector(dataNodeSelector)
needSync = true
}
if metaNodeSelector != "" && metaNodeSelector != ns.GetMetaNodeSelector() {
ns.SetMetaNodeSelector(metaNodeSelector)
needSync = true
}
if needSync {
err = m.cluster.syncUpdateNodeSet(ns)
if err != nil {
return
}
}
}
return
}
func (m *Server) updateZoneNodesetNodeSelector(zoneName string, nodesetId uint64, dataNodesetSelector string, metaNodesetSelector string) (err error) {
var ns *nodeSet
var ok bool
var value interface{}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
err = fmt.Errorf("zonename [%v] not found", zoneName)
return
}
zone := value.(*Zone)
if ns, ok = zone.nodeSetMap[nodesetId]; !ok {
err = fmt.Errorf("nodesetId [%v] not found", nodesetId)
return
}
needSync := false
if dataNodesetSelector != "" && dataNodesetSelector != ns.GetDataNodeSelector() {
ns.SetDataNodeSelector(dataNodesetSelector)
needSync = true
}
if metaNodesetSelector != "" && metaNodesetSelector != ns.GetMetaNodeSelector() {
ns.SetMetaNodeSelector(metaNodesetSelector)
needSync = true
}
if needSync {
err = m.cluster.syncUpdateNodeSet(ns)
if err != nil {
return
}
}
log.LogInfof("action[updateNodesetNodeSelector] zonename %v nodeset %v dataNodeSelector %v metaNodeSelector %v", zoneName, nodesetId, dataNodesetSelector, metaNodesetSelector)
return
}
func (m *Server) updateClusterSelector(dataNodesetSelector string, metaNodesetSelector string, dataNodeSelector string, metaNodeSelector string) (err error) {
m.cluster.t.zoneMap.Range(func(key, value interface{}) bool {
zone := value.(*Zone)
err = zone.updateNodesetSelector(m.cluster, dataNodesetSelector, metaNodesetSelector)
if err != nil {
return false
}
err = m.updateZoneNodeSelector(zone.name, dataNodeSelector, metaNodeSelector)
if err != nil {
return false
}
return true
})
return
}
func (m *Server) setDpRdOnly(partitionID uint64, rdOnly bool) (err error) {
var dp *DataPartition
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
return fmt.Errorf("[setPartitionRdOnly] getDataPartitionByID err(%s)", err.Error())
}
dp.RLock()
dp.RdOnly = rdOnly
m.cluster.syncUpdateDataPartition(dp)
dp.RUnlock()
return
}
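// setNodeRdOnly marks a data node or meta node read-only (or clears the flag) and persists the change, rolling back the in-memory flag if persistence fails.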
func (m *Server) setNodeRdOnly(addr string, nodeType uint32, rdOnly bool) (err error) {
if nodeType == TypeDataPartition {
m.cluster.dnMutex.Lock()
defer m.cluster.dnMutex.Unlock()
value, ok := m.cluster.dataNodes.Load(addr)
if !ok {
return fmt.Errorf("[setNodeRdOnly] data node %s is not exist", addr)
}
dataNode := value.(*DataNode)
oldRdOnly := dataNode.RdOnly
dataNode.RdOnly = rdOnly
if err = m.cluster.syncUpdateDataNode(dataNode); err != nil {
dataNode.RdOnly = oldRdOnly
return fmt.Errorf("[setNodeRdOnly] syncUpdateDataNode err(%s)", err.Error())
}
return
}
m.cluster.mnMutex.Lock()
defer m.cluster.mnMutex.Unlock()
value, ok := m.cluster.metaNodes.Load(addr)
if !ok {
return fmt.Errorf("[setNodeRdOnly] meta node %s is not exist", addr)
}
metaNode := value.(*MetaNode)
oldRdOnly := metaNode.RdOnly
metaNode.RdOnly = rdOnly
if err = m.cluster.syncUpdateMetaNode(metaNode); err != nil {
metaNode.RdOnly = oldRdOnly
return fmt.Errorf("[setNodeRdOnly] syncUpdateMetaNode err(%s)", err.Error())
}
return
}
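// updateNodesetCapcity sets the capacity of a nodeset; the value must be between the default replica number and 100.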
func (m *Server) updateNodesetCapcity(zoneName string, nodesetId uint64, capcity uint64) (err error) {
var ns *nodeSet
var ok bool
var value interface{}
if capcity < defaultReplicaNum || capcity > 100 {
err = fmt.Errorf("capcity [%v] value out of scope", capcity)
return
}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
err = fmt.Errorf("zonename [%v] not found", zoneName)
return
}
zone := value.(*Zone)
if ns, ok = zone.nodeSetMap[nodesetId]; !ok {
err = fmt.Errorf("nodesetId [%v] not found", nodesetId)
return
}
ns.Capacity = int(capcity)
m.cluster.syncUpdateNodeSet(ns)
log.LogInfof("action[updateNodesetCapcity] zonename %v nodeset %v capcity %v", zoneName, nodesetId, capcity)
return
}
func (m *Server) buildNodeSetGrpInfoByID(domainId, grpId uint64) (*proto.SimpleNodeSetGrpInfo, error) {
domainIndex := m.cluster.domainManager.domainId2IndexMap[domainId]
nsgm := m.cluster.domainManager.domainNodeSetGrpVec[domainIndex]
var index int
for index = 0; index < len(nsgm.nodeSetGrpMap); index++ {
if nsgm.nodeSetGrpMap[index].ID == grpId {
break
}
if nsgm.nodeSetGrpMap[index].ID > grpId {
return nil, fmt.Errorf("id not found")
}
}
if index == len(nsgm.nodeSetGrpMap) {
return nil, fmt.Errorf("id not found")
}
return m.buildNodeSetGrpInfo(nsgm.nodeSetGrpMap[index]), nil
}
func (m *Server) buildNodeSetGrpInfo(nsg *nodeSetGroup) *proto.SimpleNodeSetGrpInfo {
nsgStat := new(proto.SimpleNodeSetGrpInfo)
nsgStat.ID = nsg.ID
nsgStat.Status = nsg.status
for i := 0; i < len(nsg.nodeSets); i++ {
var nsStat proto.NodeSetInfo
nsStat.ID = nsg.nodeSets[i].ID
nsStat.Capacity = nsg.nodeSets[i].Capacity
nsStat.ZoneName = nsg.nodeSets[i].zoneName
nsg.nodeSets[i].dataNodes.Range(func(key, value interface{}) bool {
node := value.(*DataNode)
nsStat.DataTotal += node.Total
if node.isWriteAble() {
nsStat.DataUsed += node.Used
} else {
nsStat.DataUsed += node.Total
}
log.LogInfof("nodeset index[%v], datanode nodeset id[%v],zonename[%v], addr[%v] inner nodesetid[%v]",
i, nsStat.ID, node.ZoneName, node.Addr, node.NodeSetID)
dataNodeInfo := &proto.DataNodeInfo{
Total: node.Total,
Used: node.Used,
AvailableSpace: node.AvailableSpace,
ID: node.ID,
ZoneName: node.ZoneName,
Addr: node.Addr,
ReportTime: node.ReportTime,
IsActive: node.isActive,
IsWriteAble: node.isWriteAble(),
UsageRatio: node.UsageRatio,
SelectedTimes: node.SelectedTimes,
DataPartitionCount: node.DataPartitionCount,
NodeSetID: node.NodeSetID,
}
nsStat.DataNodes = append(nsStat.DataNodes, dataNodeInfo)
return true
})
nsStat.DataUseRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", float64(nsStat.DataUsed)/float64(nsStat.DataTotal)), 64)
nsg.nodeSets[i].metaNodes.Range(func(key, value interface{}) bool {
node := value.(*MetaNode)
nsStat.MetaTotal += node.Total
nsStat.MetaUsed += node.Used
log.LogInfof("nodeset index[%v], metanode nodeset id[%v],zonename[%v], addr[%v] inner nodesetid[%v]",
i, nsStat.ID, node.ZoneName, node.Addr, node.NodeSetID)
metaNodeInfo := &proto.MetaNodeInfo{
ID: node.ID,
Addr: node.Addr,
IsActive: node.IsActive,
IsWriteAble: node.isWritable(),
ZoneName: node.ZoneName,
MaxMemAvailWeight: node.MaxMemAvailWeight,
Total: node.Total,
Used: node.Used,
Ratio: node.Ratio,
SelectCount: node.SelectCount,
Threshold: node.Threshold,
ReportTime: node.ReportTime,
MetaPartitionCount: node.MetaPartitionCount,
NodeSetID: node.NodeSetID,
}
nsStat.MetaNodes = append(nsStat.MetaNodes, metaNodeInfo)
return true
})
nsStat.MetaUseRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", float64(nsStat.MetaUsed)/float64(nsStat.MetaTotal)), 64)
nsgStat.NodeSetInfo = append(nsgStat.NodeSetInfo, nsStat)
log.LogInfof("nodeset index[%v], nodeset id[%v],capacity[%v], datatotal[%v] dataused[%v] metatotal[%v] metaused[%v], metanode[%v], datanodes[%v]",
i, nsStat.ID, nsStat.Capacity, nsStat.DataTotal, nsStat.DataUsed, nsStat.MetaTotal, nsStat.MetaUsed, nsStat.MetaNodes, nsStat.DataNodes)
}
return nsgStat
}
func parseSetNodeRdOnlyParam(r *http.Request) (addr string, nodeType uint32, rdOnly bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
if addr = r.FormValue(addrKey); addr == "" {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is empty", addrKey)
return
}
if nodeType, err = parseNodeType(r); err != nil {
return
}
val := r.FormValue(rdOnlyKey)
if val == "" {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is empty", rdOnlyKey)
return
}
if rdOnly, err = strconv.ParseBool(val); err != nil {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is not bool value %s", rdOnlyKey, val)
return
}
return
}
func parseSetDpRdOnlyParam(r *http.Request) (dpId uint64, rdOnly bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
if dpId, err = extractDataPartitionID(r); err != nil {
err = fmt.Errorf("parseSetDpRdOnlyParam get dpid error %v", err)
return
}
val := r.FormValue(rdOnlyKey)
if val == "" {
err = fmt.Errorf("parseSetDpRdOnlyParam %s is empty", rdOnlyKey)
return
}
if rdOnly, err = strconv.ParseBool(val); err != nil {
err = fmt.Errorf("parseSetDpRdOnlyParam %s is not bool value %s", rdOnlyKey, val)
return
}
return
}
func parseNodeType(r *http.Request) (nodeType uint32, err error) {
var val string
var nodeTypeUint64 uint64
if val = r.FormValue(nodeTypeKey); val == "" {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is empty", nodeTypeKey)
return
}
if nodeTypeUint64, err = strconv.ParseUint(val, 10, 32); err != nil {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is not number, err %s", nodeTypeKey, err.Error())
return
}
nodeType = uint32(nodeTypeUint64)
if nodeType != TypeDataPartition && nodeType != TypeMetaPartition {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is not legal, must be %d or %d", nodeTypeKey, TypeDataPartition, TypeMetaPartition)
return
}
return
}
func (m *Server) setNodeRdOnlyHandler(w http.ResponseWriter, r *http.Request) {
var (
addr string
nodeType uint32
rdOnly bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetNodeRdOnly))
defer func() {
doStatAndMetric(proto.AdminSetNodeRdOnly, metric, err, nil)
}()
addr, nodeType, rdOnly, err = parseSetNodeRdOnlyParam(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("[setNodeRdOnlyHandler] set node %s to rdOnly(%v)", addr, rdOnly)
err = m.setNodeRdOnly(addr, nodeType, rdOnly)
if err != nil {
log.LogErrorf("[setNodeRdOnlyHandler] set node %s to rdOnly %v, err (%s)", addr, rdOnly, err.Error())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("[setNodeRdOnlyHandler] set node %s to rdOnly(%v) success", addr, rdOnly)))
return
}
func (m *Server) setDpRdOnlyHandler(w http.ResponseWriter, r *http.Request) {
var (
dpId uint64
rdOnly bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetDpRdOnly))
defer func() {
doStatAndMetric(proto.AdminSetDpRdOnly, metric, err, nil)
}()
dpId, rdOnly, err = parseSetDpRdOnlyParam(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("[setNodeRdOnlyHandler] set dp %v to rdOnly(%v)", dpId, rdOnly)
err = m.setDpRdOnly(dpId, rdOnly)
if err != nil {
log.LogErrorf("[setNodeRdOnlyHandler] set dp %v to rdOnly %v, err (%s)", dpId, rdOnly, err.Error())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("[setNodeRdOnlyHandler] set dpid %v to rdOnly(%v) success", dpId, rdOnly)))
return
}
func (m *Server) updateNodeSetCapacityHandler(w http.ResponseWriter, r *http.Request) {
var (
params map[string]interface{}
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateNodeSetCapcity))
defer func() {
doStatAndMetric(proto.AdminUpdateNodeSetCapcity, metric, err, nil)
}()
if params, err = parseAndExtractSetNodeSetInfoParams(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err := m.updateNodesetCapcity(params[zoneNameKey].(string), params[idKey].(uint64), params[countKey].(uint64)); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply("set nodesetinfo successfully"))
}
func (m *Server) updateDataUseRatioHandler(w http.ResponseWriter, r *http.Request) {
	var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateDomainDataUseRatio))
defer func() {
doStatAndMetric(proto.AdminUpdateDomainDataUseRatio, metric, err, nil)
}()
var value string
if value = r.FormValue(ratio); value == "" {
err = keyNotFound(ratio)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
var ratioVal float64
if ratioVal, err = strconv.ParseFloat(value, 64); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
	if ratioVal <= 0 || ratioVal > 1 {
		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "ratio must be in the range (0, 1]"})
return
}
if err = m.updateDataUseRatio(ratioVal); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set nodesetinfo params %v successfully", params)))
}
func (m *Server) updateZoneExcludeRatioHandler(w http.ResponseWriter, r *http.Request) {
	var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateZoneExcludeRatio))
defer func() {
doStatAndMetric(proto.AdminUpdateZoneExcludeRatio, metric, err, nil)
}()
var value string
if value = r.FormValue(ratio); value == "" {
err = keyNotFound(ratio)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
var ratioVal float64
if ratioVal, err = strconv.ParseFloat(value, 64); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.updateExcludeZoneUseRatio(ratioVal); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set nodesetinfo params %v successfully", params)))
}
func (m *Server) updateNodeSetIdHandler(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
id uint64
zoneName string
err error
nodeType uint64
value string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateNodeSetId))
defer func() {
doStatAndMetric(proto.AdminUpdateNodeSetId, metric, err, nil)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
}
}()
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
if err = r.ParseForm(); err != nil {
return
}
if nodeAddr, err = extractNodeAddr(r); err != nil {
return
}
if id, err = extractNodeID(r); err != nil {
return
}
if value = r.FormValue(nodeTypeKey); value == "" {
err = fmt.Errorf("need param nodeType")
return
}
if nodeType, err = strconv.ParseUint(value, 10, 64); err != nil {
return
}
if err = m.updateNodesetId(zoneName, id, nodeType, nodeAddr); err != nil {
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("update node setid successfully")))
}
func (m *Server) updateNodeSetNodeSelector(w http.ResponseWriter, r *http.Request) {
var (
id uint64
zoneName string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateNodeSetNodeSelector))
defer func() {
doStatAndMetric(proto.AdminUpdateNodeSetNodeSelector, metric, err, nil)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
}
}()
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
if err = r.ParseForm(); err != nil {
return
}
if id, err = extractNodesetID(r); err != nil {
return
}
dataNodeSelector := extractDataNodeSelector(r)
metaNodeSelector := r.FormValue(metaNodeSelectorKey)
if err = m.updateZoneNodesetNodeSelector(zoneName, id, dataNodeSelector, metaNodeSelector); err != nil {
return
}
sendOkReply(w, r, newSuccessHTTPReply("update nodeset selector successfully"))
}
// get nodeset group info of a fault domain by id
func (m *Server) getNodeSetGrpInfoHandler(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetNodeSetGrpInfo))
defer func() {
doStatAndMetric(proto.AdminGetNodeSetGrpInfo, metric, err, nil)
}()
if err = r.ParseForm(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
var value string
var id uint64
if value = r.FormValue(idKey); value != "" {
id, err = strconv.ParseUint(value, 10, 64)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
}
var domainId uint64
if value = r.FormValue(domainIdKey); value != "" {
domainId, err = strconv.ParseUint(value, 10, 64)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
}
log.LogInfof("action[getNodeSetGrpInfoHandler] id [%v]", id)
var info *proto.SimpleNodeSetGrpInfo
if info, err = m.buildNodeSetGrpInfoByID(domainId, id); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(info))
}
func (m *Server) getIsDomainOn(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetIsDomainOn))
defer func() {
doStatAndMetric(proto.AdminGetIsDomainOn, metric, nil, nil)
}()
type SimpleDomainInfo struct {
DomainOn bool
}
nsglStat := new(SimpleDomainInfo)
nsglStat.DomainOn = m.cluster.FaultDomain
sendOkReply(w, r, newSuccessHTTPReply(nsglStat))
}
func (m *Server) createDomainHandler(w http.ResponseWriter, r *http.Request) {
nsgm := m.cluster.domainManager
var (
zoneName string
err error
)
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("zonename null").Error()})
return
}
if err = nsgm.createDomain(zoneName); err != nil {
log.LogErrorf("action[createDomainHandler] err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("successful")))
}
// get nodeset group info of all fault domains
func (m *Server) getAllNodeSetGrpInfoHandler(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetAllNodeSetGrpInfo))
defer func() {
doStatAndMetric(proto.AdminGetAllNodeSetGrpInfo, metric, nil, nil)
}()
nsgm := m.cluster.domainManager
nsglStat := new(proto.DomainNodeSetGrpInfoList)
nsglStat.DomainOn = m.cluster.FaultDomain
nsglStat.NeedDomain = m.cluster.needFaultDomain
nsglStat.DataRatioLimit = nsgm.dataRatioLimit
nsglStat.ZoneExcludeRatioLimit = nsgm.excludeZoneUseRatio
nsglStat.ExcludeZones = nsgm.c.t.domainExcludeZones
for i := 0; i < len(nsgm.domainNodeSetGrpVec); i++ {
nodeSetGrpInfoList := &proto.SimpleNodeSetGrpInfoList{}
nodeSetGrpInfoList.DomainId = nsgm.domainNodeSetGrpVec[i].domainId
nodeSetGrpInfoList.Status = nsgm.domainNodeSetGrpVec[i].status
nsglStat.DomainNodeSetGrpInfo = append(nsglStat.DomainNodeSetGrpInfo, nodeSetGrpInfoList)
log.LogInfof("action[getAllNodeSetGrpInfoHandler] start build domain id [%v]", nsgm.domainNodeSetGrpVec[i].domainId)
for j := 0; j < len(nsgm.domainNodeSetGrpVec[i].nodeSetGrpMap); j++ {
log.LogInfof("action[getAllNodeSetGrpInfoHandler] build domain id [%v] nodeset group index [%v] Print inner nodeset now!",
nsgm.domainNodeSetGrpVec[i].domainId, j)
nodeSetGrpInfoList.SimpleNodeSetGrpInfo = append(nodeSetGrpInfoList.SimpleNodeSetGrpInfo,
m.buildNodeSetGrpInfo(nsgm.domainNodeSetGrpVec[i].nodeSetGrpMap[j]))
}
}
sendOkReply(w, r, newSuccessHTTPReply(nsglStat))
}
// get cluster-level node tuning parameters (delete batch count, repair rate limits, etc.)
func (m *Server) getNodeInfoHandler(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetNodeInfo))
defer func() {
doStatAndMetric(proto.AdminGetNodeInfo, metric, nil, nil)
}()
resp := make(map[string]string)
resp[nodeDeleteBatchCountKey] = fmt.Sprintf("%v", m.cluster.cfg.MetaNodeDeleteBatchCount)
resp[nodeMarkDeleteRateKey] = fmt.Sprintf("%v", m.cluster.cfg.DataNodeDeleteLimitRate)
resp[nodeDeleteWorkerSleepMs] = fmt.Sprintf("%v", m.cluster.cfg.MetaNodeDeleteWorkerSleepMs)
resp[nodeAutoRepairRateKey] = fmt.Sprintf("%v", m.cluster.cfg.DataNodeAutoRepairLimitRate)
resp[nodeDpRepairTimeOutKey] = fmt.Sprintf("%v", m.cluster.cfg.DpRepairTimeOut)
resp[nodeDpMaxRepairErrCntKey] = fmt.Sprintf("%v", m.cluster.cfg.DpMaxRepairErrCnt)
resp[clusterLoadFactorKey] = fmt.Sprintf("%v", m.cluster.cfg.ClusterLoadFactor)
resp[maxDpCntLimitKey] = fmt.Sprintf("%v", m.cluster.cfg.MaxDpCntLimit)
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) diagnoseMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
err error
rstMsg *proto.MetaPartitionDiagnosis
inactiveNodes []string
noLeaderMps []*MetaPartition
lackReplicaMps []*MetaPartition
badReplicaMps []*MetaPartition
excessReplicaMPs []*MetaPartition
inodeCountNotEqualReplicaMps []*MetaPartition
maxInodeNotEqualMPs []*MetaPartition
dentryCountNotEqualReplicaMps []*MetaPartition
corruptMpIDs []uint64
lackReplicaMpIDs []uint64
badReplicaMpIDs []uint64
excessReplicaMpIDs []uint64
inodeCountNotEqualReplicaMpIDs []uint64
maxInodeNotEqualReplicaMpIDs []uint64
dentryCountNotEqualReplicaMpIDs []uint64
badMetaPartitions []badPartitionView
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDiagnoseMetaPartition))
defer func() {
doStatAndMetric(proto.AdminDiagnoseMetaPartition, metric, err, nil)
}()
corruptMpIDs = make([]uint64, 0)
lackReplicaMpIDs = make([]uint64, 0)
badReplicaMpIDs = make([]uint64, 0)
excessReplicaMpIDs = make([]uint64, 0)
if inactiveNodes, err = m.cluster.checkInactiveMetaNodes(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if lackReplicaMps, noLeaderMps, badReplicaMps, excessReplicaMPs,
inodeCountNotEqualReplicaMps, maxInodeNotEqualMPs, dentryCountNotEqualReplicaMps, err = m.cluster.checkReplicaMetaPartitions(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
for _, mp := range noLeaderMps {
corruptMpIDs = append(corruptMpIDs, mp.PartitionID)
}
for _, mp := range lackReplicaMps {
lackReplicaMpIDs = append(lackReplicaMpIDs, mp.PartitionID)
}
for _, mp := range badReplicaMps {
badReplicaMpIDs = append(badReplicaMpIDs, mp.PartitionID)
}
for _, mp := range excessReplicaMPs {
excessReplicaMpIDs = append(excessReplicaMpIDs, mp.PartitionID)
}
for _, mp := range inodeCountNotEqualReplicaMps {
inodeCountNotEqualReplicaMpIDs = append(inodeCountNotEqualReplicaMpIDs, mp.PartitionID)
}
for _, mp := range maxInodeNotEqualMPs {
maxInodeNotEqualReplicaMpIDs = append(maxInodeNotEqualReplicaMpIDs, mp.PartitionID)
}
for _, mp := range dentryCountNotEqualReplicaMps {
dentryCountNotEqualReplicaMpIDs = append(dentryCountNotEqualReplicaMpIDs, mp.PartitionID)
}
badMetaPartitions = m.cluster.getBadMetaPartitionsView()
rstMsg = &proto.MetaPartitionDiagnosis{
InactiveMetaNodes: inactiveNodes,
CorruptMetaPartitionIDs: corruptMpIDs,
LackReplicaMetaPartitionIDs: lackReplicaMpIDs,
BadMetaPartitionIDs: badMetaPartitions,
BadReplicaMetaPartitionIDs: badReplicaMpIDs,
ExcessReplicaMetaPartitionIDs: excessReplicaMpIDs,
InodeCountNotEqualReplicaMetaPartitionIDs: inodeCountNotEqualReplicaMpIDs,
MaxInodeNotEqualReplicaMetaPartitionIDs: maxInodeNotEqualReplicaMpIDs,
DentryCountNotEqualReplicaMetaPartitionIDs: dentryCountNotEqualReplicaMpIDs,
}
log.LogInfof("diagnose metaPartition cluster[%v], inactiveNodes:[%v], corruptMpIDs:[%v], "+
"lackReplicaMpIDs:[%v], badReplicaMpIDs:[%v], excessReplicaDpIDs[%v] "+
"inodeCountNotEqualReplicaMpIDs[%v] dentryCountNotEqualReplicaMpIDs[%v]",
m.cluster.Name, inactiveNodes, corruptMpIDs, lackReplicaMpIDs, badReplicaMpIDs, excessReplicaMpIDs,
inodeCountNotEqualReplicaMpIDs, dentryCountNotEqualReplicaMpIDs)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// Decommission a disk. This will decommission all the data partitions on this disk.
// If parameter diskDisable is true, creating data partitions on this disk will not be allowed.
func (m *Server) decommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
offLineAddr, diskPath string
diskDisable bool
err error
raftForce bool
limit int
decommissionType int
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.DecommissionDisk))
defer func() {
doStatAndMetric(proto.DecommissionDisk, metric, err, nil)
}()
// default diskDisable is true
if offLineAddr, diskPath, diskDisable, limit, decommissionType, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.migrateDisk(offLineAddr, diskPath, "", raftForce, limit, diskDisable, uint32(decommissionType)); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("decommission disk [%v:%v] submited!need check status later!", offLineAddr, diskPath)
Warn(m.clusterName, rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
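// Note (an assumption inferred from the query handlers below, not a documented API
// contract): the decommission task created by decommissionDisk is tracked in
// m.cluster.DecommissionDisks under a key of the form "<nodeAddr>_<diskPath>", so its
// progress can later be polled with the same addr/disk parameters, e.g.:
//
//	key := fmt.Sprintf("%s_%s", offLineAddr, diskPath) // e.g. "192.168.0.11:17310_/cfs/disk1" (hypothetical values)
//	value, ok := m.cluster.DecommissionDisks.Load(key)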
// Recommission a disk that was disabled during decommissioning. This will allow creating data partitions on this disk again.
func (m *Server) recommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
node *DataNode
rstMsg string
onLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.RecommissionDisk))
defer func() {
doStatAndMetric(proto.RecommissionDisk, metric, err, nil)
}()
if onLineAddr, diskPath, err = parseReqToRecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if node, err = m.cluster.dataNode(onLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
if err = m.cluster.deleteAndSyncDecommissionedDisk(node, diskPath); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("receive recommissionDisk node[%v] disk[%v], and recommission successfully",
node.Addr, diskPath)
Warn(m.clusterName, rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) restoreStoppedAutoDecommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_restoreStoppedAutoDecommissionDisk")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.restoreStoppedAutoDecommissionDisk(offLineAddr, diskPath); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("restoreStoppedAutoDecommissionDisk node[%v] disk[%v] submited!need check status later!",
offLineAddr, diskPath)
Warn(m.clusterName, rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) queryDiskDecoProgress(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_queryDiskDecoProgress")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
key := fmt.Sprintf("%s_%s", offLineAddr, diskPath)
value, ok := m.cluster.DecommissionDisks.Load(key)
if !ok {
ret := fmt.Sprintf("action[queryDiskDecoProgress]cannot found decommission task for node[%v] disk[%v], "+
"may be already offline", offLineAddr, diskPath)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: ret})
return
}
disk := value.(*DecommissionDisk)
status, progress := disk.updateDecommissionStatus(m.cluster, true)
progress, _ = FormatFloatFloor(progress, 4)
resp := &proto.DecommissionProgress{
Status: status,
Progress: fmt.Sprintf("%.2f%%", progress*float64(100)),
StatusMessage: GetDecommissionStatusMessage(status),
}
if status == DecommissionFail {
dps := disk.GetLatestDecommissionDP(m.cluster)
dpIds := make([]uint64, 0)
for _, dp := range dps {
if dp.IsDecommissionFailed() {
dpIds = append(dpIds, dp.PartitionID)
}
}
resp.FailedDps = dpIds
}
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) queryDecommissionDiskDecoFailedDps(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_queryDecommissionDiskDecoFailedDps")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
key := fmt.Sprintf("%s_%s", offLineAddr, diskPath)
value, ok := m.cluster.DecommissionDisks.Load(key)
if !ok {
ret := fmt.Sprintf("action[queryDiskDecoProgress]cannot found decommission task for node[%v] disk[%v], "+
"may be already offline", offLineAddr, diskPath)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: ret})
return
}
disk := value.(*DecommissionDisk)
err, dps := disk.GetDecommissionFailedDP(m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(dps))
}
func (m *Server) queryAllDecommissionDisk(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt("req_queryAllDecommissionDisk")
defer func() {
metric.Set(err)
}()
resp := &proto.DecommissionDisksResponse{}
m.cluster.DecommissionDisks.Range(func(key, value interface{}) bool {
disk := value.(*DecommissionDisk)
info := proto.DecommissionDiskInfo{
SrcAddr: disk.SrcAddr,
DiskPath: disk.DiskPath,
DecommissionStatus: disk.GetDecommissionStatus(),
DecommissionRaftForce: disk.DecommissionRaftForce,
DecommissionRetry: disk.DecommissionRetry,
DecommissionDpTotal: disk.DecommissionDpTotal,
DecommissionTerm: disk.DecommissionTerm,
DecommissionLimit: disk.DecommissionDpCount,
Type: disk.Type,
DecommissionCompleteTime: disk.DecommissionCompleteTime,
}
_, info.Progress = disk.updateDecommissionStatus(m.cluster, true)
resp.Infos = append(resp.Infos, info)
return true
})
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) markDecoDiskFixed(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_markDecoDiskFixed")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
key := fmt.Sprintf("%s_%s", offLineAddr, diskPath)
value, ok := m.cluster.DecommissionDisks.Load(key)
if !ok {
ret := fmt.Sprintf("action[queryDiskDecoProgress]cannot found decommission task for node[%v] disk[%v], "+
"may be already offline", offLineAddr, diskPath)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: ret})
return
}
disk := value.(*DecommissionDisk)
err = m.cluster.syncDeleteDecommissionDisk(disk)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
m.cluster.DecommissionDisks.Delete(disk.GenerateKey())
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
func (m *Server) cancelDecommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_cancelDecommissionDisk")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
key := fmt.Sprintf("%s_%s", offLineAddr, diskPath)
value, ok := m.cluster.DecommissionDisks.Load(key)
if !ok {
ret := fmt.Sprintf("action[queryDiskDecoProgress]cannot found decommission task for node[%v] disk[%v], "+
"may be already offline", offLineAddr, diskPath)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: ret})
return
}
disk := value.(*DecommissionDisk)
err, dps := m.cluster.decommissionDiskCancel(disk)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg := fmt.Sprintf("cancel decommission data node [%s] disk[%s] successfully with failed dp %v",
offLineAddr, diskPath, dps)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// handle tasks such as heartbeat, loadDataPartition, deleteDataPartition, etc.
func (m *Server) handleDataNodeTaskResponse(w http.ResponseWriter, r *http.Request) {
var (
tr *proto.AdminTask
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetDataNodeTaskResponse))
defer func() {
doStatAndMetric(proto.GetDataNodeTaskResponse, metric, err, nil)
}()
tr, err = parseRequestToGetTaskResponse(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("%v", http.StatusOK)))
m.cluster.handleDataNodeTaskResponse(tr.OperatorAddr, tr)
}
func (m *Server) addMetaNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
zoneName string
id uint64
err error
nodesetId uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AddMetaNode))
defer func() {
doStatAndMetric(proto.AddMetaNode, metric, err, nil)
}()
if nodeAddr, zoneName, err = parseRequestForAddNode(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !checkIpPort(nodeAddr) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
var value string
if value = r.FormValue(idKey); value == "" {
nodesetId = 0
} else {
if nodesetId, err = strconv.ParseUint(value, 10, 64); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
}
if id, err = m.cluster.addMetaNode(nodeAddr, zoneName, nodesetId); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
func (m *Server) checkInvalidIDNodes(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetInvalidNodes))
defer func() {
doStatAndMetric(proto.AdminGetInvalidNodes, metric, nil, nil)
}()
nodes := m.cluster.getInvalidIDNodes()
sendOkReply(w, r, newSuccessHTTPReply(nodes))
}
func (m *Server) updateDataNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
id uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateDataNode))
defer func() {
doStatAndMetric(proto.AdminUpdateDataNode, metric, err, nil)
}()
if nodeAddr, id, err = parseRequestForUpdateMetaNode(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.updateDataNodeBaseInfo(nodeAddr, id); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
func (m *Server) updateMetaNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
id uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateMetaNode))
defer func() {
doStatAndMetric(proto.AdminUpdateMetaNode, metric, err, nil)
}()
if nodeAddr, id, err = parseRequestForUpdateMetaNode(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.updateMetaNodeBaseInfo(nodeAddr, id); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
func (m *Server) getMetaNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
metaNode *MetaNode
metaNodeInfo *proto.MetaNodeInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetMetaNode))
defer func() {
doStatAndMetric(proto.GetMetaNode, metric, err, nil)
}()
if nodeAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if metaNode, err = m.cluster.metaNode(nodeAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaNodeNotExists))
return
}
metaNode.PersistenceMetaPartitions = m.cluster.getAllMetaPartitionIDByMetaNode(nodeAddr)
metaNodeInfo = &proto.MetaNodeInfo{
ID: metaNode.ID,
Addr: metaNode.Addr,
DomainAddr: metaNode.DomainAddr,
IsActive: metaNode.IsActive,
IsWriteAble: metaNode.isWritable(),
ZoneName: metaNode.ZoneName,
MaxMemAvailWeight: metaNode.MaxMemAvailWeight,
Total: metaNode.Total,
Used: metaNode.Used,
Ratio: metaNode.Ratio,
SelectCount: metaNode.SelectCount,
Threshold: metaNode.Threshold,
ReportTime: metaNode.ReportTime,
MetaPartitionCount: metaNode.MetaPartitionCount,
NodeSetID: metaNode.NodeSetID,
PersistenceMetaPartitions: metaNode.PersistenceMetaPartitions,
CpuUtil: metaNode.CpuUtil.Load(),
}
sendOkReply(w, r, newSuccessHTTPReply(metaNodeInfo))
}
func (m *Server) decommissionMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
nodeAddr string
mp *MetaPartition
msg string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDecommissionMetaPartition))
defer func() {
doStatAndMetric(proto.AdminDecommissionMetaPartition, metric, err, nil)
}()
if partitionID, nodeAddr, err = parseRequestToDecommissionMetaPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
if err = m.cluster.decommissionMetaPartition(nodeAddr, mp); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf(proto.AdminDecommissionMetaPartition+" partitionID :%v decommissionMetaPartition successfully", partitionID)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func parseMigrateNodeParam(r *http.Request) (srcAddr, targetAddr string, limit int, err error) {
if err = r.ParseForm(); err != nil {
return
}
srcAddr = r.FormValue(srcAddrKey)
if srcAddr == "" {
err = fmt.Errorf("parseMigrateNodeParam %s can't be empty", srcAddrKey)
return
}
if ipAddr, ok := util.ParseAddrToIpAddr(srcAddr); ok {
srcAddr = ipAddr
}
targetAddr = r.FormValue(targetAddrKey)
if targetAddr == "" {
err = fmt.Errorf("parseMigrateNodeParam %s can't be empty when migrate", targetAddrKey)
return
}
if ipAddr, ok := util.ParseAddrToIpAddr(targetAddr); ok {
targetAddr = ipAddr
}
if srcAddr == targetAddr {
err = fmt.Errorf("parseMigrateNodeParam srcAddr %s can't be equal to targetAddr %s", srcAddr, targetAddr)
return
}
limit, err = parseUintParam(r, countKey)
if err != nil {
return
}
return
}
func parseUintParam(r *http.Request, key string) (num int, err error) {
val := r.FormValue(key)
if val == "" {
num = 0
return
}
numVal, err := strconv.ParseInt(val, 10, 32)
if err != nil {
err = fmt.Errorf("parseUintParam %s-%s is not legal, err %s", key, val, err.Error())
return
}
num = int(numVal)
return
}
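// Hedged examples of parseUintParam behaviour (derived from the code above rather than
// from separate documentation): an absent form value is treated as 0 instead of an
// error, while a non-numeric or overflowing value is rejected:
//
//	limit, _ := parseUintParam(r, countKey) // 0 when the form value is empty
//	_, err := parseUintParam(r, countKey)   // err != nil for values such as "abc" or numbers overflowing int32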
func (m *Server) loadMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
msg string
mp *MetaPartition
partitionID uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminLoadMetaPartition))
defer func() {
doStatAndMetric(proto.AdminLoadMetaPartition, metric, err, nil)
}()
if partitionID, err = parseRequestToLoadMetaPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
m.cluster.loadMetaPartitionAndCheckResponse(mp)
msg = fmt.Sprintf(proto.AdminLoadMetaPartition+" partitionID :%v Load successfully", partitionID)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func (m *Server) migrateMetaNodeHandler(w http.ResponseWriter, r *http.Request) {
var (
srcAddr string
targetAddr string
limit int
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.MigrateMetaNode))
defer func() {
doStatAndMetric(proto.MigrateMetaNode, metric, err, nil)
}()
srcAddr, targetAddr, limit, err = parseMigrateNodeParam(r)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if limit > defaultMigrateMpCnt {
err = fmt.Errorf("limit %d can't be bigger than %d", limit, defaultMigrateMpCnt)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
srcNode, err := m.cluster.metaNode(srcAddr)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeMetaNodeNotExists, Msg: err.Error()})
return
}
targetNode, err := m.cluster.metaNode(targetAddr)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeMetaNodeNotExists, Msg: err.Error()})
return
}
if srcNode.NodeSetID != targetNode.NodeSetID {
err = fmt.Errorf("src %s and target %s must exist in the same nodeSet when migrate", srcAddr, targetAddr)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if !targetNode.isWritable() {
err = fmt.Errorf("[%s] is not writable, can't used as target addr for migrate", targetAddr)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.migrateMetaNode(srcAddr, targetAddr, limit); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg := fmt.Sprintf("migrateMetaNodeHandler from src [%v] to targaet[%s] has migrate successfully", srcAddr, targetAddr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) decommissionMetaNode(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
offLineAddr string
limit int
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.DecommissionMetaNode))
defer func() {
doStatAndMetric(proto.DecommissionMetaNode, metric, err, nil)
}()
if offLineAddr, limit, err = parseDecomNodeReq(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.metaNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaNodeNotExists))
return
}
if err = m.cluster.migrateMetaNode(offLineAddr, "", limit); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("decommissionMetaNode metaNode [%v] limit %d has offline successfully", offLineAddr, limit)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) handleMetaNodeTaskResponse(w http.ResponseWriter, r *http.Request) {
var (
tr *proto.AdminTask
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetMetaNodeTaskResponse))
defer func() {
doStatAndMetric(proto.GetMetaNodeTaskResponse, metric, err, nil)
}()
tr, err = parseRequestToGetTaskResponse(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("%v", http.StatusOK)))
m.cluster.handleMetaNodeTaskResponse(tr.OperatorAddr, tr)
}
// Dynamically add a raft node (replica) for the master.
// By using this function, there is no need to stop all the master services. Adding a new raft node is performed online.
func (m *Server) addRaftNode(w http.ResponseWriter, r *http.Request) {
var (
id uint64
addr string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AddRaftNode))
defer func() {
doStatAndMetric(proto.AddRaftNode, metric, err, nil)
}()
var msg string
id, addr, err = parseRequestForRaftNode(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.addRaftNode(id, addr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("add raft node id :%v, addr:%v successfully \n", id, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
// Dynamically remove a master node. Similar to addRaftNode, this operation is performed online.
func (m *Server) removeRaftNode(w http.ResponseWriter, r *http.Request) {
var (
id uint64
addr string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.RemoveRaftNode))
defer func() {
doStatAndMetric(proto.RemoveRaftNode, metric, err, nil)
}()
var msg string
id, addr, err = parseRequestForRaftNode(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
err = m.cluster.removeRaftNode(id, addr)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("remove raft node id :%v,adr:%v successfully\n", id, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
// get master's raft status
func (m *Server) getRaftStatus(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.RaftStatus))
defer func() {
doStatAndMetric(proto.RaftStatus, metric, nil, nil)
}()
data := m.raftStore.RaftStatus(GroupID)
log.LogInfof("get raft status, %s", data.String())
sendOkReply(w, r, newSuccessHTTPReply(data))
}
func parseReqToDecoDisk(r *http.Request) (nodeAddr, diskPath string, diskDisable bool, limit, decommissionType int, err error) {
if err = r.ParseForm(); err != nil {
return
}
nodeAddr, err = extractNodeAddr(r)
if err != nil {
return
}
diskPath, err = extractDiskPath(r)
if err != nil {
return
}
diskDisable, err = extractDiskDisable(r)
if err != nil {
return
}
decommissionType, err = parseUintParam(r, DecommissionType)
if err != nil {
return
}
limit, err = parseUintParam(r, countKey)
if err != nil {
return
}
return
}
func parseReqToRecoDisk(r *http.Request) (nodeAddr, diskPath string, err error) {
if err = r.ParseForm(); err != nil {
return
}
nodeAddr, err = extractNodeAddr(r)
if err != nil {
return
}
diskPath, err = extractDiskPath(r)
if err != nil {
return
}
return
}
type getVolParameter struct {
name string
authKey string
skipOwnerValidation bool
}
func pareseBoolWithDefault(r *http.Request, key string, old bool) (bool, error) {
val := r.FormValue(key)
if val == "" {
return old, nil
}
newVal, err := strconv.ParseBool(val)
if err != nil {
return false, fmt.Errorf("parse %s bool val err, err %s", key, err.Error())
}
return newVal, nil
}
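// Hedged sketch of the fallback semantics above: a missing key keeps the caller's
// previous value, and any present value must satisfy strconv.ParseBool:
//
//	v, _ := pareseBoolWithDefault(r, raftForceDelKey, false)   // false when the key is absent
//	v2, err := pareseBoolWithDefault(r, raftForceDelKey, true) // true when absent; err != nil for values like "maybe"
//	_, _, _ = v, v2, err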
func parseRaftForce(r *http.Request) (bool, error) {
return pareseBoolWithDefault(r, raftForceDelKey, false)
}
func extractPosixAcl(r *http.Request) (enablePosix bool, err error) {
var value string
if value = r.FormValue(enablePosixAclKey); value == "" {
return
}
status, err := strconv.ParseBool(value)
if err != nil {
return false, fmt.Errorf("parse %s failed, val %s", enablePosixAclKey, value)
}
return status, nil
}
func (m *Server) getMetaPartitions(w http.ResponseWriter, r *http.Request) {
var (
name string
vol *Vol
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientMetaPartitions))
defer func() {
doStatAndMetric(proto.ClientMetaPartitions, metric, err, map[string]string{exporter.Vol: name})
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
mpsCache := vol.getMpsCache()
if len(mpsCache) == 0 {
vol.updateViewCache(m.cluster)
mpsCache = vol.getMpsCache()
}
send(w, r, mpsCache)
return
}
func (m *Server) putDataPartitions(w http.ResponseWriter, r *http.Request) {
var (
body []byte
name string
err error
)
defer func() {
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
}
}()
if name, err = parseAndExtractName(r); err != nil {
return
}
if err = r.ParseForm(); err != nil {
return
}
if body, err = io.ReadAll(r.Body); err != nil {
return
}
if !m.cluster.partition.IsRaftLeader() {
view := &proto.DataPartitionsView{}
if err = proto.UnmarshalHTTPReply(body, view); err != nil {
log.LogErrorf("putDataPartitions. umarshal reply.Data error volName %v", name)
return
}
m.cluster.followerReadManager.updateVolViewFromLeader(name, view)
sendOkReply(w, r, newSuccessHTTPReply("success"))
return
} else {
err = fmt.Errorf("raft leader cann't be grant dps info")
log.LogErrorf("putDataPartitions. err %v", err)
}
}
// Obtain all the data partitions in a volume.
func (m *Server) getDataPartitions(w http.ResponseWriter, r *http.Request) {
var (
body []byte
name string
compress bool
vol *Vol
err error
)
compress = r.Header.Get(proto.HeaderAcceptEncoding) == compressor.EncodingGzip
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientDataPartitions))
defer func() {
doStatAndMetric(proto.ClientDataPartitions, metric, err, map[string]string{exporter.Vol: name})
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("action[getDataPartitions] current is leader[%v], compress[%v]",
m.cluster.partition.IsRaftLeader(), compress)
if !m.cluster.partition.IsRaftLeader() {
var ok bool
if body, ok = m.cluster.followerReadManager.getVolViewAsFollower(name, compress); !ok {
log.LogErrorf("action[getDataPartitions] volume [%v] not get partitions info", name)
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("follower volume info not found")))
return
}
if compress {
w.Header().Add(proto.HeaderContentEncoding, compressor.EncodingGzip)
}
send(w, r, body)
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if compress {
body, err = vol.getDataPartitionViewCompress()
} else {
body, err = vol.getDataPartitionsView()
}
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if compress {
w.Header().Add(proto.HeaderContentEncoding, compressor.EncodingGzip)
}
send(w, r, body)
}
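// Hedged client-side sketch: the gzip branch above is driven purely by the request's
// Accept-Encoding header and mirrored in the response's Content-Encoding, so a caller
// that opts in must decompress the body itself (the URL, port, and volume name below
// are hypothetical):
//
//	req, _ := http.NewRequest(http.MethodGet, "http://master:17010/client/partitions?name=vol1", nil)
//	req.Header.Set("Accept-Encoding", "gzip")
//	resp, _ := http.DefaultClient.Do(req)
//	defer resp.Body.Close()
//	var body io.Reader = resp.Body
//	if resp.Header.Get("Content-Encoding") == "gzip" {
//		body, _ = gzip.NewReader(resp.Body)
//	}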
func (m *Server) getVol(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
message string
jobj proto.APIAccessReq
ticket cryptoutil.Ticket
ts int64
param *getVolParameter
volName string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientVol))
defer func() {
doStatAndMetric(proto.ClientVol, metric, err, map[string]string{exporter.Vol: volName})
}()
if param, err = parseGetVolParameter(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
volName = param.name
if vol, err = m.cluster.getVol(param.name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if !param.skipOwnerValidation && !matchKey(vol.Owner, param.authKey) {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolAuthKeyNotMatch))
return
}
viewCache := vol.getViewCache()
if len(viewCache) == 0 {
vol.updateViewCache(m.cluster)
viewCache = vol.getViewCache()
}
if !param.skipOwnerValidation && vol.authenticate {
if jobj, ticket, ts, err = parseAndCheckTicket(r, m.cluster.MasterSecretKey, param.name); err != nil {
if err == proto.ErrExpiredTicket {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInvalidTicket, Msg: err.Error()})
return
}
if message, err = genRespMessage(viewCache, &jobj, ts, ticket.SessionKey.Key); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeMasterAPIGenRespError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(message))
} else {
send(w, r, viewCache)
}
}
// Obtain the volume information such as total capacity and used space, etc.
func (m *Server) getVolStatInfo(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
ver int
vol *Vol
byMeta bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientVolStat))
defer func() {
doStatAndMetric(proto.ClientVolStat, metric, err, map[string]string{exporter.Vol: name})
}()
if name, ver, byMeta, err = parseVolStatReq(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if proto.IsCold(vol.VolType) && ver != proto.LFClient {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "ec-vol is supported by LF client only"})
return
}
sendOkReply(w, r, newSuccessHTTPReply(volStat(vol, byMeta)))
}
func volStat(vol *Vol, countByMeta bool) (stat *proto.VolStatInfo) {
stat = new(proto.VolStatInfo)
stat.Name = vol.Name
stat.TotalSize = vol.Capacity * util.GB
stat.UsedSize = vol.totalUsedSpaceByMeta(countByMeta)
if stat.UsedSize > stat.TotalSize {
log.LogWarnf("vol(%v) useSize(%v) is larger than capacity(%v)", vol.Name, stat.UsedSize, stat.TotalSize)
}
stat.UsedRatio = strconv.FormatFloat(float64(stat.UsedSize)/float64(stat.TotalSize), 'f', 2, 32)
stat.DpReadOnlyWhenVolFull = vol.DpReadOnlyWhenVolFull
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
stat.InodeCount += mp.InodeCount
stat.TxCnt += mp.TxCnt
stat.TxRbInoCnt += mp.TxRbInoCnt
stat.TxRbDenCnt += mp.TxRbDenCnt
}
vol.mpsLock.RUnlock()
log.LogDebugf("total[%v],usedSize[%v]", stat.TotalSize, stat.UsedSize)
if proto.IsHot(vol.VolType) {
return
}
stat.CacheTotalSize = vol.CacheCapacity * util.GB
stat.CacheUsedSize = vol.cfsUsedSpace()
stat.CacheUsedRatio = strconv.FormatFloat(float64(stat.CacheUsedSize)/float64(stat.CacheTotalSize), 'f', 2, 32)
log.LogDebugf("ebsTotal[%v],ebsUsedSize[%v]", stat.CacheTotalSize, stat.CacheUsedSize)
return
}
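// Worked example for the ratio formatting above (illustrative numbers only): with
// UsedSize = 3*util.GB and TotalSize = 10*util.GB,
//
//	strconv.FormatFloat(float64(3*util.GB)/float64(10*util.GB), 'f', 2, 32) // "0.30"
//
// i.e. UsedRatio is a two-decimal string ratio, not a percentage.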
func getMetaPartitionView(mp *MetaPartition) (mpView *proto.MetaPartitionView) {
mpView = proto.NewMetaPartitionView(mp.PartitionID, mp.Start, mp.End, mp.Status)
mp.RLock()
defer mp.RUnlock()
for _, host := range mp.Hosts {
mpView.Members = append(mpView.Members, host)
}
mr, err := mp.getMetaReplicaLeader()
if err != nil {
return
}
mpView.LeaderAddr = mr.Addr
mpView.MaxInodeID = mp.MaxInodeID
mpView.InodeCount = mp.InodeCount
mpView.DentryCount = mp.DentryCount
mpView.FreeListLen = mp.FreeListLen
mpView.TxCnt = mp.TxCnt
mpView.TxRbInoCnt = mp.TxRbInoCnt
mpView.TxRbDenCnt = mp.TxRbDenCnt
mpView.IsRecover = mp.IsRecover
return
}
func (m *Server) getMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
err error
partitionID uint64
mp *MetaPartition
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientMetaPartition))
defer func() {
doStatAndMetric(proto.ClientMetaPartition, metric, err, nil)
}()
if partitionID, err = parseAndExtractPartitionInfo(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
toInfo := func(mp *MetaPartition) *proto.MetaPartitionInfo {
mp.RLock()
defer mp.RUnlock()
replicas := make([]*proto.MetaReplicaInfo, len(mp.Replicas))
zones := make([]string, len(mp.Hosts))
nodeSets := make([]uint64, len(mp.Hosts))
for idx, host := range mp.Hosts {
metaNode, err := m.cluster.metaNode(host)
if err == nil {
zones[idx] = metaNode.ZoneName
nodeSets[idx] = metaNode.NodeSetID
}
}
for i := 0; i < len(replicas); i++ {
replicas[i] = &proto.MetaReplicaInfo{
Addr: mp.Replicas[i].Addr,
DomainAddr: mp.Replicas[i].metaNode.DomainAddr,
MaxInodeID: mp.Replicas[i].MaxInodeID,
ReportTime: mp.Replicas[i].ReportTime,
Status: mp.Replicas[i].Status,
IsLeader: mp.Replicas[i].IsLeader,
InodeCount: mp.Replicas[i].InodeCount,
DentryCount: mp.Replicas[i].DentryCount,
MaxInode: mp.Replicas[i].MaxInodeID,
}
}
forbidden := true
vol, err := m.cluster.getVol(mp.volName)
if err == nil {
forbidden = vol.Forbidden
} else {
log.LogErrorf("action[getMetaPartition]failed to get volume %v, err %v", mp.volName, err)
}
mpInfo := &proto.MetaPartitionInfo{
PartitionID: mp.PartitionID,
Start: mp.Start,
End: mp.End,
VolName: mp.volName,
MaxInodeID: mp.MaxInodeID,
InodeCount: mp.InodeCount,
DentryCount: mp.DentryCount,
Replicas: replicas,
ReplicaNum: mp.ReplicaNum,
Status: mp.Status,
IsRecover: mp.IsRecover,
Hosts: mp.Hosts,
Peers: mp.Peers,
Zones: zones,
NodeSets: nodeSets,
MissNodes: mp.MissNodes,
OfflinePeerID: mp.OfflinePeerID,
LoadResponse: mp.LoadResponse,
Forbidden: forbidden,
}
return mpInfo
}
sendOkReply(w, r, newSuccessHTTPReply(toInfo(mp)))
}
func (m *Server) listVols(w http.ResponseWriter, r *http.Request) {
var (
err error
keywords string
vol *Vol
volsInfo []*proto.VolInfo
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminListVols))
defer func() {
doStatAndMetric(proto.AdminListVols, metric, err, nil)
}()
if keywords, err = parseKeywords(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
volsInfo = make([]*proto.VolInfo, 0)
for _, name := range m.cluster.allVolNames() {
if strings.Contains(name, keywords) {
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
stat := volStat(vol, false)
volInfo := proto.NewVolInfo(vol.Name, vol.Owner, vol.createTime, vol.status(), stat.TotalSize,
stat.UsedSize, stat.DpReadOnlyWhenVolFull)
volsInfo = append(volsInfo, volInfo)
}
}
sendOkReply(w, r, newSuccessHTTPReply(volsInfo))
}
func (m *Server) changeMasterLeader(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminChangeMasterLeader))
defer func() {
doStatAndMetric(proto.AdminChangeMasterLeader, metric, err, nil)
}()
if err = m.cluster.tryToChangeLeaderByHost(); err != nil {
log.LogErrorf("changeMasterLeader.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
rstMsg := fmt.Sprintf(" changeMasterLeader. command success send to dest host but need check. ")
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) OpFollowerPartitionsRead(w http.ResponseWriter, r *http.Request) {
var (
err error
enableFollower bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminOpFollowerPartitionsRead))
defer func() {
doStatAndMetric(proto.AdminOpFollowerPartitionsRead, metric, err, nil)
}()
log.LogDebugf("OpFollowerPartitionsRead.")
if enableFollower, err = extractStatus(r); err != nil {
log.LogErrorf("OpFollowerPartitionsRead.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
m.cluster.followerReadManager.needCheck = enableFollower
rstMsg := fmt.Sprintf(" OpFollowerPartitionsRead. set needCheck %v command success. ", enableFollower)
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) CreateVersion(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
ver *proto.VolVersionInfo
value string
force bool
)
log.LogInfof("action[CreateVersion]")
if err = r.ParseForm(); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrParamError))
return
}
if name, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrParamError))
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if value = r.FormValue(forceKey); value != "" {
force, _ = strconv.ParseBool(value)
}
if ver, err = vol.VersionMgr.createVer2PhaseTask(m.cluster, uint64(time.Now().UnixMicro()), proto.CreateVersion, force); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVersionOpError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(ver))
}
func (m *Server) DelVersion(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
verSeq uint64
value string
force bool
)
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("volName %v not exist", name)))
return
}
if value = r.FormValue(verSeqKey); value == "" {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("verSeq not exist")))
return
}
	if verSeq, err = extractUint64(r, verSeqKey); err != nil {
		sendErrReply(w, r, newErrHTTPReply(err))
		return
	}
	log.LogDebugf("action[DelVersion] vol %v verSeq %v", name, verSeq)
if value = r.FormValue(forceKey); value != "" {
force, _ = strconv.ParseBool(value)
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if _, err = vol.VersionMgr.createVer2PhaseTask(m.cluster, verSeq, proto.DeleteVersion, force); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVersionOpError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply("success!"))
}
func (m *Server) GetVersionInfo(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
verSeq uint64
verInfo *proto.VolVersionInfo
)
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if verSeq, err = extractUint64(r, verSeqKey); err != nil {
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if verInfo, err = vol.VersionMgr.getVersionInfo(verSeq); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVersionOpError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(verInfo))
}
func (m *Server) GetAllVersionInfo(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
verList *proto.VolVersionInfoList
)
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
//if !proto.IsHot(vol.VolType) {
// sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVersionOpError, Msg: "vol need be hot one"})
// return
//}
verList = vol.VersionMgr.getVersionList()
sendOkReply(w, r, newSuccessHTTPReply(verList))
}
func (m *Server) SetVerStrategy(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
strategy proto.VolumeVerStrategy
isForce bool
)
if name, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if strategy, isForce, err = parseVolVerStrategy(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.SetVerStrategy(name, strategy, isForce); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
func (m *Server) getVolVer(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
info *proto.VolumeVerInfo
)
if name, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if info, err = m.cluster.getVolVer(name); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(info))
}
func genRespMessage(data []byte, req *proto.APIAccessReq, ts int64, key []byte) (message string, err error) {
var (
jresp []byte
resp proto.MasterAPIAccessResp
)
resp.Data = data
resp.APIResp.Type = req.Type + 1
resp.APIResp.ClientID = req.ClientID
resp.APIResp.ServiceID = req.ServiceID
	resp.APIResp.Verifier = ts + 1 // increase ts by one so the client can verify the server
if jresp, err = json.Marshal(resp); err != nil {
err = fmt.Errorf("json marshal for response failed %s", err.Error())
return
}
if message, err = cryptoutil.EncodeMessage(jresp, key); err != nil {
err = fmt.Errorf("encdoe message for response failed %s", err.Error())
return
}
return
}
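// Hedged sketch of the verifier convention above (the client side is an assumption
// inferred from the "+1" comment, it is not shown in this file): after decoding the
// reply, a client would accept it only if the server echoed its timestamp plus one:
//
//	if resp.APIResp.Verifier != ts+1 {
//		// reject: the reply was not produced for this request
//	}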
func (m *Server) associateVolWithUser(userID, volName string) error {
var err error
var userInfo *proto.UserInfo
if userInfo, err = m.user.getUserInfo(userID); err != nil && err != proto.ErrUserNotExists {
return err
}
if err == proto.ErrUserNotExists {
param := proto.UserCreateParam{
ID: userID,
Password: DefaultUserPassword,
Type: proto.UserTypeNormal,
}
		if userInfo, err = m.user.createKey(&param); err != nil {
return err
}
}
if _, err = m.user.addOwnVol(userInfo.UserID, volName); err != nil {
return err
}
return nil
}
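// Minimal usage sketch (the user and volume IDs are hypothetical): associateVolWithUser
// is create-if-missing, so a brand-new owner gets an account with DefaultUserPassword
// before the volume is attached to it:
//
//	if err := m.associateVolWithUser("alice", "vol-demo"); err != nil {
//		log.LogErrorf("associate vol with user failed: %v", err)
//	}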
func (m *Server) updateDecommissionLimit(w http.ResponseWriter, r *http.Request) {
var (
limit uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateDecommissionLimit))
defer func() {
doStatAndMetric(proto.AdminUpdateDecommissionLimit, metric, err, nil)
}()
if limit, err = parseRequestToUpdateDecommissionLimit(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
err = zone.updateDecommissionLimit(int32(limit), m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
}
m.cluster.DecommissionLimit = limit
if err = m.cluster.syncPutCluster(); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set master not worked %v", err)))
return
}
rstMsg := fmt.Sprintf("set decommission limit to %v successfully", limit)
log.LogDebugf("action[updateDecommissionLimit] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) updateDecommissionDiskFactor(w http.ResponseWriter, r *http.Request) {
var (
factor float64
err error
)
metric := exporter.NewTPCnt("req_updateDecommissionDiskFactor")
defer func() {
metric.Set(err)
}()
if factor, err = parseRequestToUpdateDecommissionDiskFactor(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
err = zone.updateDecommissionDiskFactor(factor, m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
}
m.cluster.DecommissionDiskFactor = factor
if err = m.cluster.syncPutCluster(); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set master not worked %v", err)))
return
}
rstMsg := fmt.Sprintf("set decommission factor to %v successfully", factor)
log.LogDebugf("action[updateDecommissionDiskFactor] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) queryDecommissionToken(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminQueryDecommissionToken))
defer func() {
doStatAndMetric(proto.AdminQueryDecommissionToken, metric, err, nil)
}()
var stats []nodeSetDecommissionParallelStatus
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
err, zoneStats := zone.queryDecommissionParallelStatus()
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
stats = append(stats, zoneStats...)
}
log.LogDebugf("action[queryDecommissionToken] %v", stats)
sendOkReply(w, r, newSuccessHTTPReply(stats))
}
func (m *Server) queryDecommissionLimit(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminQueryDecommissionLimit))
defer func() {
doStatAndMetric(proto.AdminQueryDecommissionLimit, metric, nil, nil)
}()
limit := m.cluster.DecommissionLimit
rstMsg := fmt.Sprintf("decommission limit is %v", limit)
log.LogDebugf("action[queryDecommissionLimit] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) queryDecommissionDiskLimit(w http.ResponseWriter, r *http.Request) {
var resp proto.DecommissionDiskLimit
metric := exporter.NewTPCnt("req_queryDecommissionDiskLimit")
defer func() {
metric.Set(nil)
}()
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
err, diskLimit := zone.queryDecommissionDiskLimit()
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
resp.Details = append(resp.Details, diskLimit...)
}
log.LogDebugf("action[queryDecommissionDiskLimit] %v", resp)
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) queryDataNodeDecoProgress(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr string
err error
dn *DataNode
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QueryDataNodeDecoProgress))
defer func() {
doStatAndMetric(proto.QueryDataNodeDecoProgress, metric, err, nil)
}()
if offLineAddr, err = parseReqToDecoDataNodeProgress(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dn, err = m.cluster.dataNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
status, progress := dn.updateDecommissionStatus(m.cluster, true)
progress, _ = FormatFloatFloor(progress, 4)
resp := &proto.DecommissionProgress{
Status: status,
Progress: fmt.Sprintf("%.2f%%", progress*float64(100)),
StatusMessage: GetDecommissionStatusMessage(status),
}
if status == DecommissionFail {
err, dps := dn.GetDecommissionFailedDPByTerm(m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
resp.FailedDps = dps
}
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) queryDataNodeDecoFailedDps(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr string
err error
dn *DataNode
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QueryDataNodeDecoFailedDps))
defer func() {
doStatAndMetric(proto.QueryDataNodeDecoFailedDps, metric, err, nil)
}()
if offLineAddr, err = parseReqToDecoDataNodeProgress(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dn, err = m.cluster.dataNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
err, dps := dn.GetDecommissionFailedDP(m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(dps))
}
func (m *Server) enableAutoDecommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
enable bool
err error
)
metric := exporter.NewTPCnt("req_enableAutoDecommissionDisk")
defer func() {
metric.Set(err)
}()
if enable, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
m.cluster.SetAutoDecommissionDisk(enable)
if err = m.cluster.syncPutCluster(); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("sync cluster config failed: %v", err)))
return
}
rstMsg := fmt.Sprintf("set auto decommission disk to %v successfully", enable)
log.LogDebugf("action[enableAutoDecommissionDisk] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) queryAutoDecommissionDisk(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt("req_queryAutoDecommissionDisk")
defer func() {
metric.Set(nil)
}()
enable := m.cluster.AutoDecommissionDiskIsEnabled()
rstMsg := fmt.Sprintf("auto decommission disk is %v", enable)
log.LogDebugf("action[queryAutoDecommissionDisk] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(enable))
}
func (m *Server) queryDisableDisk(w http.ResponseWriter, r *http.Request) {
var (
node *DataNode
rstMsg string
nodeAddr string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.RecommissionDisk))
defer func() {
doStatAndMetric(proto.RecommissionDisk, metric, err, nil)
}()
if nodeAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if node, err = m.cluster.dataNode(nodeAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
disks := node.getDecommissionedDisks()
rstMsg = fmt.Sprintf("datanode[%v] disable disk[%v]",
nodeAddr, disks)
Warn(m.clusterName, rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func parseReqToDecoDataNodeProgress(r *http.Request) (nodeAddr string, err error) {
if err = r.ParseForm(); err != nil {
return
}
nodeAddr, err = extractNodeAddr(r)
if err != nil {
return
}
return
}
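// FormatFloatFloor floors num to the given number of decimal places,
// e.g. FormatFloatFloor(0.98765, 4) returns 0.9876.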
func FormatFloatFloor(num float64, decimal int) (float64, error) {
d := float64(1)
if decimal > 0 {
d = math.Pow10(decimal)
}
res := strconv.FormatFloat(math.Floor(num*d)/d, 'f', -1, 64)
return strconv.ParseFloat(res, 64)
}
func (m *Server) setCheckDataReplicasEnable(w http.ResponseWriter, r *http.Request) {
var (
err error
enable bool
)
if enable, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
oldValue := m.cluster.checkDataReplicasEnable
if oldValue != enable {
m.cluster.checkDataReplicasEnable = enable
if err = m.cluster.syncPutCluster(); err != nil {
m.cluster.checkDataReplicasEnable = oldValue
log.LogErrorf("action[setCheckDataReplicasEnable] syncPutCluster failed %v", err)
sendErrReply(w, r, newErrHTTPReply(proto.ErrPersistenceByRaft))
return
}
}
log.LogInfof("action[setCheckDataReplicasEnable] enable is set to [%v]", enable)
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"set checkDataReplicasEnable to [%v] successfully", enable)))
}
func (m *Server) setFileStats(w http.ResponseWriter, r *http.Request) {
var (
err error
enable bool
)
if enable, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
oldValue := m.cluster.fileStatsEnable
m.cluster.fileStatsEnable = enable
if err = m.cluster.syncPutCluster(); err != nil {
m.cluster.fileStatsEnable = oldValue
log.LogErrorf("action[setFileStats] syncPutCluster failed %v", err)
sendErrReply(w, r, newErrHTTPReply(proto.ErrPersistenceByRaft))
return
}
log.LogInfof("action[setFileStats] enable is set to [%v]", enable)
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"set fileStatsEnable to [%v] successfully", enable)))
}
func (m *Server) getFileStats(w http.ResponseWriter, r *http.Request) {
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"getFileStats enable value [%v]", m.cluster.fileStatsEnable)))
}
func (m *Server) GetClusterValue(w http.ResponseWriter, r *http.Request) {
result, err := m.cluster.fsm.store.SeekForPrefix([]byte(clusterPrefix))
if err != nil {
log.LogErrorf("action[GetClusterValue],err:%v", err.Error())
sendErrReply(w, r, newErrHTTPReply(proto.ErrInternalError))
return
}
for _, value := range result {
cv := &clusterValue{}
if err = json.Unmarshal(value, cv); err != nil {
log.LogErrorf("action[GetClusterValue], unmarshal err:%v", err.Error())
sendErrReply(w, r, newErrHTTPReply(proto.ErrUnmarshalData))
return
}
sendOkReply(w, r, newSuccessHTTPReply(cv))
}
}
func (m *Server) setClusterUuidEnable(w http.ResponseWriter, r *http.Request) {
var (
err error
enable bool
)
if m.cluster.clusterUuid == "" {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: "no ClusterUuid, generate it first"})
return
}
if enable, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
oldValue := m.cluster.clusterUuidEnable
m.cluster.clusterUuidEnable = enable
if err = m.cluster.syncPutCluster(); err != nil {
m.cluster.clusterUuidEnable = oldValue
log.LogErrorf("action[setClusterUuidEnable] syncPutCluster failed %v", err)
sendErrReply(w, r, newErrHTTPReply(proto.ErrPersistenceByRaft))
return
}
log.LogInfof("action[setClusterUuidEnable] enable is set to [%v]", enable)
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"set clusterUuidEnable to [%v] successfully", enable)))
}
func (m *Server) generateClusterUuid(w http.ResponseWriter, r *http.Request) {
if m.cluster.clusterUuid != "" {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: "The cluster already has a ClusterUuid"})
return
}
if err := m.cluster.generateClusterUuid(); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrInternalError))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"generate ClusterUUID [%v] successfully", m.cluster.clusterUuid)))
}
func (m *Server) getClusterUuid(w http.ResponseWriter, r *http.Request) {
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"ClusterUUID [%v], enable value [%v]", m.cluster.clusterUuid, m.cluster.clusterUuidEnable)))
}
func (m *Server) setConfigHandler(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetConfig))
defer func() {
doStatAndMetric(proto.AdminSetConfig, metric, err, nil)
}()
key, value, err := parseSetConfigParam(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("[setConfigHandler] set config key[%v], value[%v]", key, value)
err = m.setConfig(key, value)
if err != nil {
log.LogErrorf("[setConfigHandler] set config key[%v], value[%v], err (%s)", key, value, err.Error())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set config key[%v], value[%v] success", key, value)))
return
}
func (m *Server) getConfigHandler(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetConfig))
defer func() {
doStatAndMetric(proto.AdminGetConfig, metric, err, nil)
}()
key, err := parseGetConfigParam(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("[getConfigHandler] get config key[%v]", key)
value, err := m.getConfig(key)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(value))
}
func (m *Server) setConfig(key string, value string) (err error) {
var metaPartitionInodeIdStep uint64
if key == cfgmetaPartitionInodeIdStep {
if metaPartitionInodeIdStep, err = strconv.ParseUint(value, 10, 64); err != nil {
return err
}
oldValue := m.config.MetaPartitionInodeIdStep
m.config.MetaPartitionInodeIdStep = metaPartitionInodeIdStep
if err = m.cluster.syncPutCluster(); err != nil {
m.config.MetaPartitionInodeIdStep = oldValue
log.LogErrorf("setConfig syncPutCluster fail err %v", err)
return err
}
} else {
err = keyNotFound("config")
}
return err
}
func (m *Server) getConfig(key string) (value string, err error) {
if key == cfgmetaPartitionInodeIdStep {
v := m.config.MetaPartitionInodeIdStep
value = strconv.FormatUint(v, 10)
} else {
err = keyNotFound("config")
}
return value, err
}
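// CreateQuota creates a directory quota for a volume: it parses the quota
// request, verifies the volume exists and has quota enabled, and then asks
// the volume's quotaManager to allocate a new quota id.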
func (m *Server) CreateQuota(w http.ResponseWriter, r *http.Request) {
req := &proto.SetMasterQuotaReuqest{}
var (
err error
vol *Vol
quotaId uint32
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaCreate))
defer func() {
doStatAndMetric(proto.QuotaCreate, metric, err, map[string]string{exporter.Vol: req.VolName})
}()
if err = parserSetQuotaParam(r, req); err != nil {
log.LogErrorf("[CreateQuota] set quota fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(req.VolName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if !vol.enableQuota {
err = errors.NewErrorf("quota is disabled for vol %v", vol.Name)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if quotaId, err = vol.quotaManager.createQuota(req); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(&quotaId))
}
func (m *Server) UpdateQuota(w http.ResponseWriter, r *http.Request) {
req := &proto.UpdateMasterQuotaReuqest{}
var (
err error
vol *Vol
)
if err = parserUpdateQuotaParam(r, req); err != nil {
log.LogErrorf("[UpdateQuota] update quota fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(req.VolName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if !vol.enableQuota {
err = errors.NewErrorf("quota is disabled for vol %v", vol.Name)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = vol.quotaManager.updateQuota(req); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("update quota successfully, req %v", req)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func (m *Server) DeleteQuota(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
quotaId uint32
name string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaDelete))
defer func() {
doStatAndMetric(proto.QuotaDelete, metric, err, map[string]string{exporter.Vol: name})
}()
if name, quotaId, err = parseDeleteQuotaParam(r); err != nil {
log.LogErrorf("[DeleteQuota] del quota fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if err = vol.quotaManager.deleteQuota(quotaId); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("delete quota successfully, vol [%v] quotaId [%v]", name, quotaId)
sendOkReply(w, r, newSuccessHTTPReply(msg))
return
}
func (m *Server) ListQuota(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
resp *proto.ListMasterQuotaResponse
name string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaList))
defer func() {
doStatAndMetric(proto.QuotaList, metric, err, map[string]string{exporter.Vol: name})
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
resp = vol.quotaManager.listQuota()
log.LogInfof("list quota vol [%v] resp [%v] success.", name, *resp)
sendOkReply(w, r, newSuccessHTTPReply(resp))
return
}
func (m *Server) ListQuotaAll(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaListAll))
defer func() {
doStatAndMetric(proto.QuotaListAll, metric, nil, nil)
}()
volsInfo := m.cluster.listQuotaAll()
log.LogInfof("list all vol has quota [%v]", volsInfo)
sendOkReply(w, r, newSuccessHTTPReply(volsInfo))
return
}
func (m *Server) GetQuota(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
quotaId uint32
quotaInfo *proto.QuotaInfo
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaGet))
defer func() {
doStatAndMetric(proto.QuotaGet, metric, err, map[string]string{exporter.Vol: name})
}()
if name, quotaId, err = parseGetQuotaParam(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if quotaInfo, err = vol.quotaManager.getQuota(quotaId); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
log.LogInfof("get quota vol [%v] quotaInfo [%v] success.", name, *quotaInfo)
sendOkReply(w, r, newSuccessHTTPReply(quotaInfo))
return
}
// func (m *Server) BatchModifyQuotaFullPath(w http.ResponseWriter, r *http.Request) {
// var (
// name string
// body []byte
// changeFullPathMap map[uint32]string
// err error
// vol *Vol
// )
// metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaGet))
// defer func() {
// doStatAndMetric(proto.QuotaBatchModifyPath, metric, err, map[string]string{exporter.Vol: name})
// }()
// if name, err = parseAndExtractName(r); err != nil {
// sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
// return
// }
// if body, err = io.ReadAll(r.Body); err != nil {
// sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
// return
// }
// changeFullPathMap = make(map[uint32]string)
// if err = json.Unmarshal(body, &changeFullPathMap); err != nil {
// sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
// return
// }
// if vol, err = m.cluster.getVol(name); err != nil {
// sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
// return
// }
// vol.quotaManager.batchModifyQuotaFullPath(changeFullPathMap)
// log.LogInfof("BatchModifyQuotaFullPath vol [%v] changeFullPathMap [%v] success.", name, changeFullPathMap)
// msg := fmt.Sprintf("BatchModifyQuotaFullPath successfully, vol [%v]", name)
// sendOkReply(w, r, newSuccessHTTPReply(msg))
// }
func parseSetDpDiscardParam(r *http.Request) (dpId uint64, rdOnly bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
if dpId, err = extractDataPartitionID(r); err != nil {
err = fmt.Errorf("parseSetDpDiscardParam get dpid error %v", err)
return
}
val := r.FormValue(dpDiscardKey)
if val == "" {
err = fmt.Errorf("parseSetDpDiscardParam %s is empty", dpDiscardKey)
return
}
if rdOnly, err = strconv.ParseBool(val); err != nil {
err = fmt.Errorf("parseSetDpDiscardParam %s is not bool value %s", dpDiscardKey, val)
return
}
return
}
func (m *Server) setDpDiscard(partitionID uint64, isDiscard bool) (err error) {
var dp *DataPartition
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
return fmt.Errorf("[setDpDiscard] getDataPartitionByID err(%s)", err.Error())
}
dp.Lock()
defer dp.Unlock()
if dp.IsDiscard && !isDiscard {
log.LogWarnf("[setDpDiscard] unset dp discard flag may cause some junk data")
}
dp.IsDiscard = isDiscard
m.cluster.syncUpdateDataPartition(dp)
return
}
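// setDpDiscardHandler parses the data partition id and the discard flag from
// the request and persists the new discard state of that partition.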
func (m *Server) setDpDiscardHandler(w http.ResponseWriter, r *http.Request) {
var (
dpId uint64
discard bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetDpDiscard))
defer func() {
doStatAndMetric(proto.AdminSetDpDiscard, metric, err, nil)
}()
dpId, discard, err = parseSetDpDiscardParam(r)
if err != nil {
log.LogErrorf("[setDpDiscardHandler] parse discard param err(%v)", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
err = m.setDpDiscard(dpId, discard)
if err != nil {
log.LogErrorf("[setDpDiscardHandler] set dp %v to discard %v, err (%s)", dpId, discard, err.Error())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
msg := fmt.Sprintf("[setDpDiscardHandler] set dpid %v to discard(%v) success", dpId, discard)
log.LogInfo(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
return
}
func (m *Server) getDiscardDpHandler(w http.ResponseWriter, r *http.Request) {
discardDpInfos := proto.DiscardDataPartitionInfos{}
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetDiscardDp))
defer func() {
doStatAndMetric(proto.AdminGetDiscardDp, metric, nil, nil)
}()
vols := m.cluster.copyVols()
for _, vol := range vols {
dps := vol.dataPartitions
for _, dp := range dps.partitions {
if dp.IsDiscard {
discardDpInfos.DiscardDps = append(discardDpInfos.DiscardDps, *dp.buildDpInfo(m.cluster))
}
}
}
msg := fmt.Sprintf("[getDiscardDpHandler] discard dp num:%v", len(discardDpInfos.DiscardDps))
log.LogInfo(msg)
sendOkReply(w, r, newSuccessHTTPReply(discardDpInfos))
return
}
func (m *Server) queryBadDisks(w http.ResponseWriter, r *http.Request) {
var (
err error
infos proto.BadDiskInfos
)
metric := exporter.NewTPCnt("req_queryBadDisks")
defer func() {
metric.Set(err)
}()
m.cluster.dataNodes.Range(func(addr, node interface{}) bool {
dataNode, ok := node.(*DataNode)
if !ok {
return true
}
for _, bds := range dataNode.BadDiskStats {
info := proto.BadDiskInfo{
Address: dataNode.Addr,
Path: bds.DiskPath,
TotalPartitionCnt: bds.TotalPartitionCnt,
DiskErrPartitionList: bds.DiskErrPartitionList,
}
infos.BadDisks = append(infos.BadDisks, info)
}
return true
})
sendOkReply(w, r, newSuccessHTTPReply(infos))
}
func (m *Server) addLcNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
id uint64
err error
)
if nodeAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !checkIp(nodeAddr) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "addr not legal"})
return
}
if id, err = m.cluster.addLcNode(nodeAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
// handleLcNodeTaskResponse handles lcnode task responses, such as heartbeat and expiration scanning results.
func (m *Server) handleLcNodeTaskResponse(w http.ResponseWriter, r *http.Request) {
tr, err := parseRequestToGetTaskResponse(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("%v", http.StatusOK)))
m.cluster.handleLcNodeTaskResponse(tr.OperatorAddr, tr)
}
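// SetBucketLifecycle reads an LcConfiguration from the request body, verifies
// that the referenced volume exists, and stores the lifecycle configuration.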
func (m *Server) SetBucketLifecycle(w http.ResponseWriter, r *http.Request) {
var (
bytes []byte
err error
)
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
req := proto.LcConfiguration{}
if err = json.Unmarshal(bytes, &req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(req.VolName); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
_ = m.cluster.SetBucketLifecycle(&req)
sendOkReply(w, r, newSuccessHTTPReply("PutBucketLifecycleConfiguration successful"))
}
func (m *Server) GetBucketLifecycle(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
lcConf *proto.LcConfiguration
)
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
lcConf = m.cluster.GetBucketLifecycle(name)
if lcConf == nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrNoSuchLifecycleConfiguration))
return
}
sendOkReply(w, r, newSuccessHTTPReply(lcConf))
}
func (m *Server) DelBucketLifecycle(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
)
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
m.cluster.DelBucketLifecycle(name)
msg := fmt.Sprintf("delete vol[%v] lifecycle successfully", name)
log.LogWarn(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func (m *Server) lcnodeInfo(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
switch r.FormValue("op") {
case "info":
var (
rsp *LcNodeInfoResponse
err error
)
if rsp, err = m.cluster.getAllLcNodeInfo(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(rsp))
case "start":
if m.cluster.partition != nil && m.cluster.partition.IsRaftLeader() {
m.cluster.startLcScan()
sendOkReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeSuccess})
} else {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "not leader"})
}
default:
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "invalid op"})
}
}
func (m *Server) S3QosSet(w http.ResponseWriter, r *http.Request) {
var (
param = &proto.S3QosRequest{}
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.S3QoSSet))
defer func() {
doStatAndMetric(proto.S3QoSSet, metric, err, nil)
}()
if err = parseS3QosReq(r, param); err != nil {
log.LogErrorf("[S3QosSet] parse fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !isS3QosConfigValid(param) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "s3 qos param err"})
return
}
// set s3 qos quota
if param.Quota != 0 {
if strings.ToLower(param.Uid) == proto.DefaultUid {
param.Uid = proto.DefaultUid
}
param.Api = strings.ToLower(param.Api)
metadata := new(RaftCmd)
metadata.Op = opSyncS3QosSet
key := param.Api + keySeparator + param.Uid + keySeparator + param.Type
metadata.K = S3QoSPrefix + key
metadata.V = []byte(strconv.FormatUint(param.Quota, 10))
// raft sync
if err = m.cluster.submit(metadata); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// memory cache
m.cluster.S3ApiQosQuota.Store(metadata.K, param.Quota)
}
// set s3 node num
if param.Nodes != 0 {
metadata := new(RaftCmd)
metadata.Op = opSyncS3QosSet
key := proto.S3Nodes
metadata.K = S3QoSPrefix + key
metadata.V = []byte(strconv.FormatUint(param.Nodes, 10))
// raft sync
if err = m.cluster.submit(metadata); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// memory cache
m.cluster.S3ApiQosQuota.Store(metadata.K, param.Nodes)
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
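// S3QosGet rebuilds the S3 QoS view from the in-memory S3ApiQosQuota cache,
// grouping bandwidth, QPS and concurrency quotas per api and uid, and also
// returns the configured S3 node number.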
func (m *Server) S3QosGet(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.S3QoSGet))
defer func() {
doStatAndMetric(proto.S3QoSGet, metric, err, nil)
}()
apiLimitConf := make(map[string]*proto.UserLimitConf, 0)
s3QosResponse := proto.S3QoSResponse{
ApiLimitConf: apiLimitConf,
}
// memory cache
m.cluster.S3ApiQosQuota.Range(func(key, value interface{}) bool {
k := key.(string)
v := value.(uint64)
api, uid, limitType, nodeNumKey, err := parseS3QoSKey(k)
if err != nil {
log.LogErrorf("[S3QosGet] parseS3QoSKey err [%v]", err)
return true
}
if nodeNumKey != "" {
s3QosResponse.Nodes = v
return true
}
if _, ok := apiLimitConf[api]; !ok {
bandWidthQuota := make(map[string]uint64, 0)
qpsQuota := make(map[string]uint64, 0)
concurrentQuota := make(map[string]uint64, 0)
userLimitConf := &proto.UserLimitConf{
BandWidthQuota: bandWidthQuota,
QPSQuota: qpsQuota,
ConcurrentQuota: concurrentQuota,
}
apiLimitConf[api] = userLimitConf
}
switch limitType {
case proto.FlowLimit:
apiLimitConf[api].BandWidthQuota[uid] = v
case proto.QPSLimit:
apiLimitConf[api].QPSQuota[uid] = v
case proto.ConcurrentLimit:
apiLimitConf[api].ConcurrentQuota[uid] = v
default:
// do nothing
}
return true
})
log.LogDebugf("[S3QosGet] s3qosInfoMap %+v", s3QosResponse)
sendOkReply(w, r, newSuccessHTTPReply(s3QosResponse))
}
func (m *Server) S3QosDelete(w http.ResponseWriter, r *http.Request) {
var (
param = &proto.S3QosRequest{}
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.S3QoSDelete))
defer func() {
doStatAndMetric(proto.S3QoSDelete, metric, err, nil)
}()
if err = parseS3QosReq(r, param); err != nil {
log.LogErrorf("[S3QosDelete] parse fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !isS3QosConfigValid(param) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "s3 qos param err"})
return
}
if strings.ToLower(param.Uid) == proto.DefaultUid {
param.Uid = proto.DefaultUid
}
param.Api = strings.ToLower(param.Api)
metadata := new(RaftCmd)
metadata.Op = opSyncS3QosDelete
key := param.Api + keySeparator + param.Uid + keySeparator + param.Type
metadata.K = S3QoSPrefix + key
// raft sync
if err = m.cluster.submit(metadata); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// memory cache
m.cluster.S3ApiQosQuota.Delete(metadata.K)
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
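// parseS3QoSKey decodes an S3ApiQosQuota key. After trimming S3QoSPrefix the
// remainder is either "api<sep>uid<sep>limitType" (a per-user quota) or the
// single token proto.S3Nodes (the s3 node number), where <sep> is keySeparator.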
func parseS3QoSKey(key string) (api, uid, limitType, nodes string, err error) {
s3qosInfo := strings.TrimPrefix(key, S3QoSPrefix)
strs := strings.Split(s3qosInfo, keySeparator)
if len(strs) == 3 {
return strs[0], strs[1], strs[2], "", nil
}
if len(strs) == 1 && strs[0] == proto.S3Nodes {
return "", "", "", strs[0], nil
}
return "", "", "", "", errors.New("unexpected key")
}
func isS3QosConfigValid(param *proto.S3QosRequest) bool {
if param.Type != proto.FlowLimit && param.Type != proto.QPSLimit && param.Type != proto.ConcurrentLimit {
return false
}
if proto.IsS3PutApi(param.Api) {
return false
}
return true
}
package master
import (
"encoding/json"
"fmt"
"io"
"net/http"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
func (m *Server) createUser(w http.ResponseWriter, r *http.Request) {
var (
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserCreate))
defer func() {
doStatAndMetric(proto.UserCreate, metric, err, nil)
}()
var bytes []byte
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserCreateParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !ownerRegexp.MatchString(param.ID) {
sendErrReply(w, r, newErrHTTPReply(proto.ErrInvalidUserID))
return
}
if param.Type == proto.UserTypeRoot {
sendErrReply(w, r, newErrHTTPReply(proto.ErrInvalidUserType))
return
}
if userInfo, err = m.user.createKey(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) deleteUser(w http.ResponseWriter, r *http.Request) {
var (
userID string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserDelete))
defer func() {
doStatAndMetric(proto.UserDelete, metric, err, nil)
}()
if userID, err = parseUser(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.user.deleteKey(userID); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("delete user[%v] successfully", userID)
log.LogWarn(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func (m *Server) updateUser(w http.ResponseWriter, r *http.Request) {
var (
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserUpdate))
defer func() {
doStatAndMetric(proto.UserUpdate, metric, err, nil)
}()
var bytes []byte
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserUpdateParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if param.Type == proto.UserTypeRoot {
sendErrReply(w, r, newErrHTTPReply(proto.ErrInvalidUserType))
return
}
if userInfo, err = m.user.updateKey(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) getUserAKInfo(w http.ResponseWriter, r *http.Request) {
var (
ak string
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserGetAKInfo))
defer func() {
doStatAndMetric(proto.UserGetAKInfo, metric, err, nil)
}()
if ak, err = parseAccessKey(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if userInfo, err = m.user.getKeyInfo(ak); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) getUserInfo(w http.ResponseWriter, r *http.Request) {
var (
userID string
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserGetInfo))
defer func() {
doStatAndMetric(proto.UserGetInfo, metric, err, nil)
}()
if userID, err = parseUser(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if userInfo, err = m.user.getUserInfo(userID); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) updateUserPolicy(w http.ResponseWriter, r *http.Request) {
var (
userInfo *proto.UserInfo
bytes []byte
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserUpdatePolicy))
defer func() {
doStatAndMetric(proto.UserUpdatePolicy, metric, err, nil)
}()
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserPermUpdateParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(param.Volume); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if userInfo, err = m.user.updatePolicy(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) removeUserPolicy(w http.ResponseWriter, r *http.Request) {
var (
userInfo *proto.UserInfo
bytes []byte
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserRemovePolicy))
defer func() {
doStatAndMetric(proto.UserRemovePolicy, metric, err, nil)
}()
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserPermRemoveParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(param.Volume); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if userInfo, err = m.user.removePolicy(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) deleteUserVolPolicy(w http.ResponseWriter, r *http.Request) {
var (
vol string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserDeleteVolPolicy))
defer func() {
doStatAndMetric(proto.UserDeleteVolPolicy, metric, err, map[string]string{exporter.Vol: vol})
}()
if vol, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.user.deleteVolPolicy(vol); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("delete vol[%v] policy successfully", vol)
log.LogWarn(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
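// transferUserVol transfers the ownership of a volume to another user. Unless
// Force is set, the current owner must match UserSrc; the user store is
// updated first and the new owner is then persisted via raft, rolling back on
// failure.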
func (m *Server) transferUserVol(w http.ResponseWriter, r *http.Request) {
var (
bytes []byte
vol *Vol
volName string
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserTransferVol))
defer func() {
doStatAndMetric(proto.UserTransferVol, metric, err, map[string]string{exporter.Vol: volName})
}()
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserTransferVolParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
volName = param.Volume
if vol, err = m.cluster.getVol(param.Volume); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if !param.Force && vol.Owner != param.UserSrc {
sendErrReply(w, r, newErrHTTPReply(proto.ErrHaveNoPolicy))
return
}
if userInfo, err = m.user.transferVol(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
owner := vol.Owner
vol.Owner = userInfo.UserID
if err = m.cluster.syncUpdateVol(vol); err != nil {
vol.Owner = owner
err = proto.ErrPersistenceByRaft
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) getAllUsers(w http.ResponseWriter, r *http.Request) {
var (
keywords string
users []*proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserList))
defer func() {
doStatAndMetric(proto.UserList, metric, err, nil)
}()
if keywords, err = parseKeywords(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
users = m.user.getAllUserInfo(keywords)
sendOkReply(w, r, newSuccessHTTPReply(users))
}
func (m *Server) getUsersOfVol(w http.ResponseWriter, r *http.Request) {
var (
volName string
users []string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UsersOfVol))
defer func() {
doStatAndMetric(proto.UsersOfVol, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if users, err = m.user.getUsersOfVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(users))
}
func parseUser(r *http.Request) (userID string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if userID, err = extractUser(r); err != nil {
return
}
return
}
func extractUser(r *http.Request) (user string, err error) {
if user = r.FormValue(userKey); user == "" {
err = keyNotFound(userKey)
return
}
return
}
func parseAccessKey(r *http.Request) (ak string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ak, err = extractAccessKey(r); err != nil {
return
}
return
}
func parseKeywords(r *http.Request) (keywords string, err error) {
if err = r.ParseForm(); err != nil {
return
}
keywords = extractKeywords(r)
return
}
func extractAccessKey(r *http.Request) (ak string, err error) {
if ak = r.FormValue(akKey); ak == "" {
err = keyNotFound(akKey)
return
}
if !proto.AKRegexp.MatchString(ak) {
return "", errors.New("accesskey can only be number and letters")
}
return
}
func extractKeywords(r *http.Request) (keywords string) {
keywords = r.FormValue(keywordsKey)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"math"
"net/http"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
authSDK "github.com/cubefs/cubefs/sdk/auth"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// Cluster stores all the cluster-level information.
type Cluster struct {
Name string
CreateTime int64
vols map[string]*Vol
dataNodes sync.Map
metaNodes sync.Map
volMutex sync.RWMutex // volume mutex
createVolMutex sync.RWMutex // create volume mutex
mnMutex sync.RWMutex // meta node mutex
dnMutex sync.RWMutex // data node mutex
nsMutex sync.RWMutex // nodeset mutex
badPartitionMutex sync.RWMutex // BadDataPartitionIds and BadMetaPartitionIds operate mutex
leaderInfo *LeaderInfo
cfg *clusterConfig
metaReady bool
retainLogs uint64
idAlloc *IDAllocator
t *topology
dataNodeStatInfo *nodeStatInfo
metaNodeStatInfo *nodeStatInfo
zoneStatInfos map[string]*proto.ZoneStat
volStatInfo sync.Map
domainManager *DomainManager
BadDataPartitionIds *sync.Map
BadMetaPartitionIds *sync.Map
DisableAutoAllocate bool
ForbidMpDecommission bool
FaultDomain bool
needFaultDomain bool // FaultDomain is true and normal zone already used up
fsm *MetadataFsm
partition raftstore.Partition
MasterSecretKey []byte
lastZoneIdxForNode int
zoneIdxMux sync.Mutex
zoneList []string
followerReadManager *followerReadManager
diskQosEnable bool
QosAcceptLimit *rate.Limiter
apiLimiter *ApiLimiter
DecommissionDisks sync.Map
DecommissionLimit uint64
EnableAutoDecommissionDisk bool
AutoDecommissionDiskMux sync.Mutex
checkAutoCreateDataPartition bool
masterClient *masterSDK.MasterClient
checkDataReplicasEnable bool
fileStatsEnable bool
clusterUuid string
clusterUuidEnable bool
inodeCountNotEqualMP *sync.Map
maxInodeNotEqualMP *sync.Map
dentryCountNotEqualMP *sync.Map
ac *authSDK.AuthClient
authenticate bool
lcNodes sync.Map
lcMgr *lifecycleManager
snapshotMgr *snapshotDelManager
DecommissionDiskFactor float64
S3ApiQosQuota *sync.Map // (api,uid,limtType) -> limitQuota
}
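// followerReadManager caches the data partition view of every volume on
// follower masters, both as raw JSON and as gzip-compressed bytes, so that
// data-partition view requests can be answered from the local cache instead
// of being forwarded to the leader.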
type followerReadManager struct {
volDataPartitionsView map[string][]byte
volDataPartitionsCompress map[string][]byte
status map[string]bool
lastUpdateTick map[string]time.Time
needCheck bool
c *Cluster
volViewMap map[string]*volValue
rwMutex sync.RWMutex
}
func newFollowerReadManager(c *Cluster) (mgr *followerReadManager) {
mgr = new(followerReadManager)
mgr.volDataPartitionsView = make(map[string][]byte)
mgr.volDataPartitionsCompress = make(map[string][]byte)
mgr.status = make(map[string]bool)
mgr.lastUpdateTick = make(map[string]time.Time)
mgr.c = c
return
}
func (mgr *followerReadManager) reSet() {
mgr.rwMutex.Lock()
defer mgr.rwMutex.Unlock()
mgr.volDataPartitionsView = make(map[string][]byte)
mgr.volDataPartitionsCompress = make(map[string][]byte)
mgr.status = make(map[string]bool)
mgr.lastUpdateTick = make(map[string]time.Time)
}
func (mgr *followerReadManager) getVolumeDpView() {
var (
err error
volViews []*volValue
view *proto.DataPartitionsView
)
if err, volViews = mgr.c.loadVolsViews(); err != nil {
panic(err)
}
mgr.rwMutex.Lock()
mgr.volViewMap = make(map[string]*volValue)
for _, vv := range volViews {
mgr.volViewMap[vv.Name] = vv
if _, ok := mgr.lastUpdateTick[vv.Name]; !ok {
// record when the volume is first discovered
mgr.lastUpdateTick[vv.Name] = time.Now()
mgr.status[vv.Name] = false
}
}
mgr.rwMutex.Unlock()
if mgr.c.masterClient.Leader() == "" {
log.LogErrorf("followerReadManager.getVolumeDpView but master leader not ready")
return
}
for _, vv := range volViews {
if vv.Status == proto.VolStatusMarkDelete {
mgr.rwMutex.Lock()
mgr.lastUpdateTick[vv.Name] = time.Now()
mgr.status[vv.Name] = false
mgr.rwMutex.Unlock()
continue
}
log.LogDebugf("followerReadManager.getVolumeDpView %v", vv.Name)
if view, err = mgr.c.masterClient.ClientAPI().GetDataPartitions(vv.Name); err != nil {
log.LogErrorf("followerReadManager.getVolumeDpView %v GetDataPartitions err %v", vv.Name, err)
continue
}
mgr.updateVolViewFromLeader(vv.Name, view)
}
}
func (mgr *followerReadManager) sendFollowerVolumeDpView() {
var err error
vols := mgr.c.copyVols()
for _, vol := range vols {
log.LogDebugf("followerReadManager.getVolumeDpView %v", vol.Name)
if vol.Status == proto.VolStatusMarkDelete {
continue
}
var body []byte
if body, err = vol.getDataPartitionsView(); err != nil {
log.LogErrorf("followerReadManager.sendFollowerVolumeDpView err %v", err)
continue
}
for _, addr := range AddrDatabase {
if addr == mgr.c.leaderInfo.addr {
continue
}
mgr.c.masterClient.SetLeader(addr)
if err = mgr.c.masterClient.AdminAPI().PutDataPartitions(vol.Name, body); err != nil {
mgr.c.masterClient.SetLeader("")
log.LogErrorf("followerReadManager.sendFollowerVolumeDpView PutDataPartitions name %v addr %v err %v", vol.Name, addr, err)
continue
}
mgr.c.masterClient.SetLeader("")
log.LogDebugf("followerReadManager.sendFollowerVolumeDpView PutDataPartitions name %v addr %v err %v", vol.Name, addr, err)
}
}
}
// NOTICE: caller must correctly use mgr.rwMutex
func (mgr *followerReadManager) isVolRecordObsolete(volName string) bool {
volView, ok := mgr.volViewMap[volName]
if !ok {
// vol has been completely deleted
return true
}
if volView.Status == proto.VolStatusMarkDelete {
return true
}
return false
}
func (mgr *followerReadManager) DelObsoleteVolRecord(obsoleteVolNames map[string]struct{}) {
mgr.rwMutex.Lock()
defer mgr.rwMutex.Unlock()
for volName := range obsoleteVolNames {
log.LogDebugf("followerReadManager.DelObsoleteVolRecord, delete obsolete vol: %v", volName)
delete(mgr.volDataPartitionsView, volName)
delete(mgr.volDataPartitionsCompress, volName)
delete(mgr.status, volName)
delete(mgr.lastUpdateTick, volName)
}
}
func (mgr *followerReadManager) checkStatus() {
mgr.rwMutex.Lock()
defer mgr.rwMutex.Unlock()
timeNow := time.Now()
for volNm, lastTime := range mgr.lastUpdateTick {
if mgr.isVolRecordObsolete(volNm) {
log.LogDebugf("action[checkStatus] volume %v is obsolete, skip it", volNm)
continue
}
if lastTime.Before(timeNow.Add(-5 * time.Minute)) {
mgr.status[volNm] = false
log.LogWarnf("action[checkStatus] volume %v expired last time %v, now %v", volNm, lastTime, timeNow)
}
}
}
func (mgr *followerReadManager) updateVolViewFromLeader(key string, view *proto.DataPartitionsView) {
if !mgr.checkViewContent(key, view, true) {
log.LogErrorf("updateVolViewFromLeader. key %v checkViewContent failed status %v", key, mgr.status[key])
return
}
reply := newSuccessHTTPReply(view)
if body, err := json.Marshal(reply); err != nil {
log.LogErrorf("action[updateDpResponseCache] marshal error %v", err)
return
} else {
mgr.rwMutex.Lock()
defer mgr.rwMutex.Unlock()
mgr.volDataPartitionsView[key] = body
gzipData, err := compressor.New(compressor.EncodingGzip).Compress(body)
if err != nil {
log.LogErrorf("action[updateDpResponseCache] compress error:%+v", err)
return
}
mgr.volDataPartitionsCompress[key] = gzipData
}
mgr.status[key] = true
mgr.lastUpdateTick[key] = time.Now()
}
func (mgr *followerReadManager) checkViewContent(volName string, view *proto.DataPartitionsView, isUpdate bool) (ok bool) {
if !isUpdate && !mgr.needCheck {
return true
}
if len(view.DataPartitions) == 0 {
return true
}
for i := 0; i < len(view.DataPartitions); i++ {
dp := view.DataPartitions[i]
if len(dp.Hosts) == 0 {
log.LogErrorf("checkViewContent. dp id %v, leader %v, status %v", dp.PartitionID, dp.LeaderAddr, dp.Status)
}
}
return true
}
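// getVolViewAsFollower returns the cached data partition view of a volume,
// choosing the gzip-compressed copy when compress is true.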
func (mgr *followerReadManager) getVolViewAsFollower(key string, compress bool) (value []byte, ok bool) {
mgr.rwMutex.RLock()
defer mgr.rwMutex.RUnlock()
ok = true
if compress {
value = mgr.volDataPartitionsCompress[key]
} else {
value = mgr.volDataPartitionsView[key]
}
log.LogDebugf("getVolViewAsFollower. volume %v return!", key)
return
}
func (mgr *followerReadManager) IsVolViewReady(volName string) bool {
mgr.rwMutex.RLock()
defer mgr.rwMutex.RUnlock()
if status, ok := mgr.status[volName]; ok {
return status
}
return false
}
func newCluster(name string, leaderInfo *LeaderInfo, fsm *MetadataFsm, partition raftstore.Partition, cfg *clusterConfig) (c *Cluster) {
c = new(Cluster)
c.Name = name
c.leaderInfo = leaderInfo
c.vols = make(map[string]*Vol, 0)
c.cfg = cfg
if c.cfg.MaxDpCntLimit == 0 {
c.cfg.MaxDpCntLimit = defaultMaxDpCntLimit
}
c.t = newTopology()
c.BadDataPartitionIds = new(sync.Map)
c.BadMetaPartitionIds = new(sync.Map)
c.dataNodeStatInfo = new(nodeStatInfo)
c.metaNodeStatInfo = new(nodeStatInfo)
c.FaultDomain = cfg.faultDomain
c.zoneStatInfos = make(map[string]*proto.ZoneStat)
c.followerReadManager = newFollowerReadManager(c)
c.fsm = fsm
c.partition = partition
c.idAlloc = newIDAllocator(c.fsm.store, c.partition)
c.domainManager = newDomainManager(c)
c.QosAcceptLimit = rate.NewLimiter(rate.Limit(c.cfg.QosMasterAcceptLimit), proto.QosDefaultBurst)
c.apiLimiter = newApiLimiter()
c.DecommissionLimit = defaultDecommissionParallelLimit
c.checkAutoCreateDataPartition = false
c.masterClient = masterSDK.NewMasterClient(nil, false)
c.inodeCountNotEqualMP = new(sync.Map)
c.maxInodeNotEqualMP = new(sync.Map)
c.dentryCountNotEqualMP = new(sync.Map)
c.lcMgr = newLifecycleManager()
c.lcMgr.cluster = c
c.snapshotMgr = newSnapshotManager()
c.snapshotMgr.cluster = c
c.S3ApiQosQuota = new(sync.Map)
return
}
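// scheduleTask starts all background jobs of the cluster, such as partition
// checks, heartbeat checks, statistics updates and decommission progress
// checks; each job runs in its own goroutine and most of them only act when
// this node is the raft leader.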
func (c *Cluster) scheduleTask() {
c.scheduleToCheckDataPartitions()
c.scheduleToLoadDataPartitions()
c.scheduleToCheckReleaseDataPartitions()
c.scheduleToCheckHeartbeat()
c.scheduleToCheckMetaPartitions()
c.scheduleToUpdateStatInfo()
c.scheduleToManageDp()
c.scheduleToCheckVolStatus()
c.scheduleToCheckVolQos()
c.scheduleToCheckDiskRecoveryProgress()
c.scheduleToCheckMetaPartitionRecoveryProgress()
c.scheduleToLoadMetaPartitions()
c.scheduleToReduceReplicaNum()
c.scheduleToCheckNodeSetGrpManagerStatus()
c.scheduleToCheckFollowerReadCache()
c.scheduleToCheckDecommissionDataNode()
c.scheduleToCheckDecommissionDisk()
c.scheduleToCheckDataReplicas()
c.scheduleToLcScan()
c.scheduleToSnapshotDelVerScan()
c.scheduleToBadDisk()
}
func (c *Cluster) masterAddr() (addr string) {
return c.leaderInfo.addr
}
func (c *Cluster) tryToChangeLeaderByHost() error {
return c.partition.TryToLeader(1)
}
func (c *Cluster) scheduleToUpdateStatInfo() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.updateStatInfo()
}
time.Sleep(2 * time.Minute)
}
}()
}
func (c *Cluster) addNodeSetGrp(ns *nodeSet, load bool) (err error) {
log.LogWarnf("addNodeSetGrp nodeSet id[%v] zonename[%v] load[%v] grpManager init[%v]",
ns.ID, ns.zoneName, load, c.domainManager.init)
if c.domainManager.init {
err = c.domainManager.putNodeSet(ns, load)
c.putZoneDomain(false)
}
return
}
const (
TypeMetaPartition uint32 = 0x01
TypeDataPartition uint32 = 0x02
)
func (c *Cluster) getHostFromDomainZone(domainId uint64, createType uint32, replicaNum uint8) (hosts []string, peers []proto.Peer, err error) {
hosts, peers, err = c.domainManager.getHostFromNodeSetGrp(domainId, replicaNum, createType)
return
}
func (c *Cluster) IsLeader() bool {
if c.partition != nil {
return c.partition.IsRaftLeader()
}
return false
}
func (c *Cluster) scheduleToManageDp() {
go func() {
// enable auto data partition creation two minutes after becoming leader
time.Sleep(2 * time.Minute)
c.checkAutoCreateDataPartition = true
}()
// schedule delete dataPartition
go func() {
time.Sleep(2 * time.Minute)
for {
if c.partition != nil && c.partition.IsRaftLeader() {
vols := c.copyVols()
for _, vol := range vols {
if proto.IsHot(vol.VolType) {
continue
}
vol.autoDeleteDp(c)
}
}
time.Sleep(2 * time.Minute)
}
}()
}
func (c *Cluster) scheduleToCheckDataPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkDataPartitions()
}
time.Sleep(time.Second * time.Duration(c.cfg.IntervalToCheckDataPartition))
}
}()
}
func (c *Cluster) scheduleToCheckVolStatus() {
go func() {
// check vols after switching leader two minutes
for {
if c.partition.IsRaftLeader() {
vols := c.copyVols()
for _, vol := range vols {
vol.checkStatus(c)
vol.CheckStrategy(c)
}
}
time.Sleep(time.Second * time.Duration(c.cfg.IntervalToCheckDataPartition))
}
}()
}
func (c *Cluster) scheduleToCheckFollowerReadCache() {
go func() {
for {
if !c.partition.IsRaftLeader() {
c.followerReadManager.getVolumeDpView()
c.followerReadManager.checkStatus()
} else {
c.followerReadManager.sendFollowerVolumeDpView()
}
time.Sleep(5 * time.Second)
}
}()
}
func (c *Cluster) scheduleToCheckVolQos() {
go func() {
// check vols after switching leader two minutes
for {
if c.partition.IsRaftLeader() {
vols := c.copyVols()
for _, vol := range vols {
vol.checkQos()
}
}
// time.Sleep(time.Second * time.Duration(c.cfg.IntervalToCheckQos))
time.Sleep(time.Duration(float32(time.Second) * 0.5))
}
}()
}
func (c *Cluster) scheduleToCheckNodeSetGrpManagerStatus() {
go func() {
for {
if !c.FaultDomain || !c.partition.IsRaftLeader() {
time.Sleep(time.Minute)
continue
}
c.domainManager.checkAllGrpState()
c.domainManager.checkExcludeZoneState()
time.Sleep(5 * time.Second)
}
}()
}
func (c *Cluster) scheduleToLoadDataPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.doLoadDataPartitions()
}
time.Sleep(time.Second * 5)
}
}()
}
// Check the replica status of each data partition.
func (c *Cluster) checkDataPartitions() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkDataPartitions occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkDataPartitions occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
readWrites := vol.checkDataPartitions(c)
vol.dataPartitions.setReadWriteDataPartitions(readWrites, c.Name)
if c.metaReady {
vol.dataPartitions.updateResponseCache(true, 0, vol.VolType)
vol.dataPartitions.updateCompressCache(true, 0, vol.VolType)
}
msg := fmt.Sprintf("action[checkDataPartitions],vol[%v] can readWrite partitions:%v ",
vol.Name, vol.dataPartitions.readableAndWritableCnt)
log.LogInfo(msg)
if c.checkAutoCreateDataPartition {
vol.checkAutoDataPartitionCreation(c)
}
}
}
func (c *Cluster) doLoadDataPartitions() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("doLoadDataPartitions occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"doLoadDataPartitions occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
if vol.Status == proto.VolStatusMarkDelete {
continue
}
vol.loadDataPartition(c)
}
}
func (c *Cluster) scheduleToCheckReleaseDataPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.releaseDataPartitionAfterLoad()
}
time.Sleep(time.Second * defaultIntervalToFreeDataPartition)
}
}()
}
// Release the memory used for loading the data partition.
func (c *Cluster) releaseDataPartitionAfterLoad() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("releaseDataPartitionAfterLoad occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"releaseDataPartitionAfterLoad occurred panic")
}
}()
vols := c.copyVols()
for _, vol := range vols {
vol.releaseDataPartitions(c.cfg.numberOfDataPartitionsToFree, c.cfg.secondsToFreeDataPartitionAfterLoad)
}
}
func (c *Cluster) scheduleToCheckHeartbeat() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkLeaderAddr()
c.checkDataNodeHeartbeat()
// update load factor
setOverSoldFactor(c.cfg.ClusterLoadFactor)
}
time.Sleep(time.Second * defaultIntervalToCheckHeartbeat)
}
}()
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkMetaNodeHeartbeat()
}
time.Sleep(time.Second * defaultIntervalToCheckHeartbeat)
}
}()
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkLcNodeHeartbeat()
}
time.Sleep(time.Second * defaultIntervalToCheckHeartbeat)
}
}()
}
func (c *Cluster) passAclCheck(ip string) {
// do nothing
}
func (c *Cluster) checkLeaderAddr() {
leaderID, _ := c.partition.LeaderTerm()
c.leaderInfo.addr = AddrDatabase[leaderID]
}
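// checkDataNodeHeartbeat builds a heartbeat task for every data node,
// attaching the names of forbidden volumes, and dispatches the tasks to the
// nodes.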
func (c *Cluster) checkDataNodeHeartbeat() {
tasks := make([]*proto.AdminTask, 0)
c.dataNodes.Range(func(addr, dataNode interface{}) bool {
node := dataNode.(*DataNode)
node.checkLiveness()
task := node.createHeartbeatTask(c.masterAddr(), c.diskQosEnable)
hbReq := task.Request.(*proto.HeartBeatRequest)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for _, vol := range c.vols {
if vol.Forbidden {
hbReq.ForbiddenVols = append(hbReq.ForbiddenVols, vol.Name)
}
}
tasks = append(tasks, task)
return true
})
c.addDataNodeTasks(tasks)
}
func (c *Cluster) checkMetaNodeHeartbeat() {
tasks := make([]*proto.AdminTask, 0)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
c.metaNodes.Range(func(addr, metaNode interface{}) bool {
node := metaNode.(*MetaNode)
node.checkHeartbeat()
task := node.createHeartbeatTask(c.masterAddr(), c.fileStatsEnable)
hbReq := task.Request.(*proto.HeartBeatRequest)
for _, vol := range c.vols {
if vol.FollowerRead {
hbReq.FLReadVols = append(hbReq.FLReadVols, vol.Name)
}
if vol.Forbidden {
hbReq.ForbiddenVols = append(hbReq.ForbiddenVols, vol.Name)
}
if !vol.EnableAuditLog {
hbReq.DisableAuditVols = append(hbReq.DisableAuditVols, vol.Name)
}
spaceInfo := vol.uidSpaceManager.getSpaceOp()
hbReq.UidLimitInfo = append(hbReq.UidLimitInfo, spaceInfo...)
if vol.quotaManager != nil {
quotaHbInfos := vol.quotaManager.getQuotaHbInfos()
if len(quotaHbInfos) != 0 {
hbReq.QuotaHbInfos = append(hbReq.QuotaHbInfos, quotaHbInfos...)
}
}
hbReq.TxInfo = append(hbReq.TxInfo, &proto.TxInfo{
Volume: vol.Name,
Mask: vol.enableTransaction,
OpLimitVal: vol.txOpLimit,
})
}
log.LogDebugf("checkMetaNodeHeartbeat start")
for _, info := range hbReq.QuotaHbInfos {
log.LogDebugf("checkMetaNodeHeartbeat info [%v]", info)
}
tasks = append(tasks, task)
return true
})
c.addMetaNodeTasks(tasks)
}
func (c *Cluster) checkLcNodeHeartbeat() {
tasks := make([]*proto.AdminTask, 0)
diedNodes := make([]string, 0)
c.lcNodes.Range(func(addr, lcNode interface{}) bool {
node := lcNode.(*LcNode)
node.checkLiveness()
if !node.IsActive {
log.LogInfof("checkLcNodeHeartbeat: lcnode(%v) is inactive", node.Addr)
diedNodes = append(diedNodes, node.Addr)
return true
}
task := node.createHeartbeatTask(c.masterAddr())
tasks = append(tasks, task)
return true
})
c.addLcNodeTasks(tasks)
for _, node := range diedNodes {
log.LogInfof("checkLcNodeHeartbeat: deregister node(%v)", node)
_ = c.delLcNode(node)
}
return
}
func (c *Cluster) scheduleToCheckMetaPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkMetaPartitions()
}
time.Sleep(time.Second * time.Duration(c.cfg.IntervalToCheckDataPartition))
}
}()
}
func (c *Cluster) checkMetaPartitions() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkMetaPartitions occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkMetaPartitions occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
vol.checkMetaPartitions(c)
}
}
func (c *Cluster) scheduleToReduceReplicaNum() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkVolReduceReplicaNum()
}
time.Sleep(5 * time.Minute)
}
}()
}
func (c *Cluster) checkVolReduceReplicaNum() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkVolReduceReplicaNum occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkVolReduceReplicaNum occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
vol.checkReplicaNum(c)
}
}
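// getInvalidIDNodes returns the meta nodes and data nodes whose registered IDs
// do not match the peer IDs recorded in their partitions.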
func (c *Cluster) getInvalidIDNodes() (nodes []*InvalidNodeView) {
metaNodes := c.getNotConsistentIDMetaNodes()
nodes = append(nodes, metaNodes...)
dataNodes := c.getNotConsistentIDDataNodes()
nodes = append(nodes, dataNodes...)
return
}
func (c *Cluster) scheduleToCheckDataReplicas() {
go func() {
for {
if c.checkDataReplicasEnable {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkDataReplicas()
}
}
time.Sleep(1 * time.Minute)
}
}()
}
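// checkDataReplicas tries to automatically add a replica for every data partition
// whose host count is below its replica number, and logs how many attempts succeeded.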
func (c *Cluster) checkDataReplicas() {
lackReplicaDataPartitions, _ := c.checkLackReplicaAndHostDataPartitions()
if len(lackReplicaDataPartitions) == 0 {
return
}
successCnt := 0
for _, dp := range lackReplicaDataPartitions {
if success, _ := c.autoAddDataReplica(dp); success {
successCnt += 1
}
}
failCnt := len(lackReplicaDataPartitions) - successCnt
log.LogInfof("action[checkDataReplicas] autoAddDataReplica successCnt[%v], failedCnt[%v]", successCnt, failCnt)
}
func (c *Cluster) getNotConsistentIDMetaNodes() (metaNodes []*InvalidNodeView) {
metaNodes = make([]*InvalidNodeView, 0)
c.metaNodes.Range(func(key, value interface{}) bool {
metanode, ok := value.(*MetaNode)
if !ok {
return true
}
notConsistent, oldID := c.hasNotConsistentIDMetaPartitions(metanode)
if notConsistent {
metaNodes = append(metaNodes, &InvalidNodeView{Addr: metanode.Addr, ID: metanode.ID, OldID: oldID, NodeType: "meta"})
}
return true
})
return
}
func (c *Cluster) hasNotConsistentIDMetaPartitions(metanode *MetaNode) (notConsistent bool, oldID uint64) {
safeVols := c.allVols()
for _, vol := range safeVols {
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
for _, peer := range mp.Peers {
if peer.Addr == metanode.Addr && peer.ID != metanode.ID {
vol.mpsLock.RUnlock()
return true, peer.ID
}
}
}
vol.mpsLock.RUnlock()
}
return
}
func (c *Cluster) getNotConsistentIDDataNodes() (dataNodes []*InvalidNodeView) {
dataNodes = make([]*InvalidNodeView, 0)
c.dataNodes.Range(func(key, value interface{}) bool {
datanode, ok := value.(*DataNode)
if !ok {
return true
}
notConsistent, oldID := c.hasNotConsistentIDDataPartitions(datanode)
if notConsistent {
dataNodes = append(dataNodes, &InvalidNodeView{Addr: datanode.Addr, ID: datanode.ID, OldID: oldID, NodeType: "data"})
}
return true
})
return
}
func (c *Cluster) hasNotConsistentIDDataPartitions(datanode *DataNode) (notConsistent bool, oldID uint64) {
safeVols := c.allVols()
for _, vol := range safeVols {
for _, mp := range vol.dataPartitions.partitions {
for _, peer := range mp.Peers {
if peer.Addr == datanode.Addr && peer.ID != datanode.ID {
return true, peer.ID
}
}
}
}
return
}
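// updateDataNodeBaseInfo rewrites the persisted record of a data node with a new ID
// by committing a delete command and an update command in a single raft batch.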
func (c *Cluster) updateDataNodeBaseInfo(nodeAddr string, id uint64) (err error) {
c.dnMutex.Lock()
defer c.dnMutex.Unlock()
value, ok := c.dataNodes.Load(nodeAddr)
if !ok {
err = fmt.Errorf("node %v is not exist", nodeAddr)
return
}
dataNode := value.(*DataNode)
if dataNode.ID == id {
return
}
cmds := make(map[string]*RaftCmd)
metadata, err := c.buildDeleteDataNodeCmd(dataNode)
if err != nil {
return
}
cmds[metadata.K] = metadata
dataNode.ID = id
metadata, err = c.buildUpdateDataNodeCmd(dataNode)
if err != nil {
return
}
cmds[metadata.K] = metadata
if err = c.syncBatchCommitCmd(cmds); err != nil {
return
}
// partitions := c.getAllMetaPartitionsByMetaNode(nodeAddr)
return
}
func (c *Cluster) updateMetaNodeBaseInfo(nodeAddr string, id uint64) (err error) {
c.mnMutex.Lock()
defer c.mnMutex.Unlock()
value, ok := c.metaNodes.Load(nodeAddr)
if !ok {
err = fmt.Errorf("node %v is not exist", nodeAddr)
return
}
metaNode := value.(*MetaNode)
if metaNode.ID == id {
return
}
cmds := make(map[string]*RaftCmd)
metadata, err := c.buildDeleteMetaNodeCmd(metaNode)
if err != nil {
return
}
cmds[metadata.K] = metadata
metaNode.ID = id
metadata, err = c.buildUpdateMetaNodeCmd(metaNode)
if err != nil {
return
}
cmds[metadata.K] = metadata
if err = c.syncBatchCommitCmd(cmds); err != nil {
return
}
// partitions := c.getAllMetaPartitionsByMetaNode(nodeAddr)
return
}
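// addMetaNode registers a meta node. If the address is already known the existing ID is
// returned; otherwise a node set in the zone is picked (or created), a new ID is allocated,
// and the node is persisted through raft and added to the topology.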
func (c *Cluster) addMetaNode(nodeAddr, zoneName string, nodesetId uint64) (id uint64, err error) {
c.mnMutex.Lock()
defer c.mnMutex.Unlock()
var metaNode *MetaNode
if value, ok := c.metaNodes.Load(nodeAddr); ok {
metaNode = value.(*MetaNode)
if nodesetId > 0 && nodesetId != metaNode.NodeSetID {
return metaNode.ID, fmt.Errorf("addr already in nodeset [%v]", nodeAddr)
}
return metaNode.ID, nil
}
metaNode = newMetaNode(nodeAddr, zoneName, c.Name)
zone, err := c.t.getZone(zoneName)
if err != nil {
zone = c.t.putZoneIfAbsent(newZone(zoneName))
}
var ns *nodeSet
if nodesetId > 0 {
if ns, err = zone.getNodeSet(nodesetId); err != nil {
return nodesetId, err
}
} else {
c.nsMutex.Lock()
ns = zone.getAvailNodeSetForMetaNode()
if ns == nil {
if ns, err = zone.createNodeSet(c); err != nil {
c.nsMutex.Unlock()
goto errHandler
}
}
c.nsMutex.Unlock()
}
if id, err = c.idAlloc.allocateCommonID(); err != nil {
goto errHandler
}
metaNode.ID = id
metaNode.NodeSetID = ns.ID
log.LogInfof("action[addMetaNode] metanode id[%v] zonename [%v] add meta node to nodesetid[%v]", id, zoneName, ns.ID)
if err = c.syncAddMetaNode(metaNode); err != nil {
goto errHandler
}
if err = c.syncUpdateNodeSet(ns); err != nil {
goto errHandler
}
c.t.putMetaNode(metaNode)
// a node set that becomes available for the first time can be put into nodesetGrp
c.addNodeSetGrp(ns, false)
c.metaNodes.Store(nodeAddr, metaNode)
log.LogInfof("action[addMetaNode],clusterID[%v] metaNodeAddr:%v,nodeSetId[%v],capacity[%v]",
c.Name, nodeAddr, ns.ID, ns.Capacity)
return
errHandler:
err = fmt.Errorf("action[addMetaNode],clusterID[%v] metaNodeAddr:%v err:%v ",
c.Name, nodeAddr, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
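// addDataNode registers a data node. If the address is already known the existing ID is
// returned; otherwise a node set in the zone is picked (or created), a new ID is allocated,
// and the node is persisted through raft and added to the topology.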
func (c *Cluster) addDataNode(nodeAddr, zoneName string, nodesetId uint64) (id uint64, err error) {
c.dnMutex.Lock()
defer c.dnMutex.Unlock()
var dataNode *DataNode
if node, ok := c.dataNodes.Load(nodeAddr); ok {
dataNode = node.(*DataNode)
if nodesetId > 0 && nodesetId != dataNode.NodeSetID {
return dataNode.ID, fmt.Errorf("addr already in nodeset [%v]", nodeAddr)
}
return dataNode.ID, nil
}
dataNode = newDataNode(nodeAddr, zoneName, c.Name)
dataNode.DpCntLimit = newDpCountLimiter(&c.cfg.MaxDpCntLimit)
zone, err := c.t.getZone(zoneName)
if err != nil {
zone = c.t.putZoneIfAbsent(newZone(zoneName))
}
var ns *nodeSet
if nodesetId > 0 {
if ns, err = zone.getNodeSet(nodesetId); err != nil {
return nodesetId, err
}
} else {
c.nsMutex.Lock()
ns = zone.getAvailNodeSetForDataNode()
if ns == nil {
if ns, err = zone.createNodeSet(c); err != nil {
c.nsMutex.Unlock()
goto errHandler
}
}
c.nsMutex.Unlock()
}
// allocate dataNode id
if id, err = c.idAlloc.allocateCommonID(); err != nil {
goto errHandler
}
dataNode.ID = id
dataNode.NodeSetID = ns.ID
log.LogInfof("action[addDataNode] datanode id[%v] zonename [%v] add node to nodesetid[%v]", id, zoneName, ns.ID)
if err = c.syncAddDataNode(dataNode); err != nil {
goto errHandler
}
if err = c.syncUpdateNodeSet(ns); err != nil {
goto errHandler
}
c.t.putDataNode(dataNode)
// a node set that becomes available for the first time can be put into nodesetGrp
c.addNodeSetGrp(ns, false)
c.dataNodes.Store(nodeAddr, dataNode)
log.LogInfof("action[addDataNode],clusterID[%v] dataNodeAddr:%v,nodeSetId[%v],capacity[%v]",
c.Name, nodeAddr, ns.ID, ns.Capacity)
return
errHandler:
err = fmt.Errorf("action[addDataNode],clusterID[%v] dataNodeAddr:%v err:%v ", c.Name, nodeAddr, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
func (c *Cluster) checkInactiveDataNodes() (inactiveDataNodes []string, err error) {
inactiveDataNodes = make([]string, 0)
c.dataNodes.Range(func(addr, node interface{}) bool {
dataNode := node.(*DataNode)
if !dataNode.isActive {
inactiveDataNodes = append(inactiveDataNodes, dataNode.Addr)
}
return true
})
log.LogInfof("clusterID[%v] inactiveDataNodes:%v", c.Name, inactiveDataNodes)
return
}
func (c *Cluster) checkLackReplicaAndHostDataPartitions() (lackReplicaDataPartitions []*DataPartition, err error) {
lackReplicaDataPartitions = make([]*DataPartition, 0)
vols := c.copyVols()
for _, vol := range vols {
dps := vol.dataPartitions
for _, dp := range dps.partitions {
if dp.ReplicaNum > uint8(len(dp.Hosts)) && len(dp.Hosts) == len(dp.Replicas) && dp.IsDecommissionInitial() {
lackReplicaDataPartitions = append(lackReplicaDataPartitions, dp)
}
}
}
log.LogInfof("clusterID[%v] checkLackReplicaAndHostDataPartitions count:[%v]", c.Name, len(lackReplicaDataPartitions))
return
}
func (c *Cluster) checkLackReplicaDataPartitions() (lackReplicaDataPartitions []*DataPartition, err error) {
lackReplicaDataPartitions = make([]*DataPartition, 0)
vols := c.copyVols()
for _, vol := range vols {
dps := vol.dataPartitions
for _, dp := range dps.partitions {
if dp.ReplicaNum > uint8(len(dp.Hosts)) {
lackReplicaDataPartitions = append(lackReplicaDataPartitions, dp)
}
}
}
log.LogInfof("clusterID[%v] lackReplicaDataPartitions count:[%v]", c.Name, len(lackReplicaDataPartitions))
return
}
func (c *Cluster) checkReplicaOfDataPartitions(ignoreDiscardDp bool) (
lackReplicaDPs []*DataPartition, unavailableReplicaDPs []*DataPartition, repFileCountDifferDps []*DataPartition,
repUsedSizeDifferDps []*DataPartition, excessReplicaDPs []*DataPartition, noLeaderDPs []*DataPartition, err error) {
noLeaderDPs = make([]*DataPartition, 0)
lackReplicaDPs = make([]*DataPartition, 0)
unavailableReplicaDPs = make([]*DataPartition, 0)
excessReplicaDPs = make([]*DataPartition, 0)
vols := c.copyVols()
for _, vol := range vols {
dps := vol.dataPartitions
for _, dp := range dps.partitions {
if ignoreDiscardDp && dp.IsDiscard {
continue
}
if vol.Status == proto.VolStatusMarkDelete {
continue
}
if proto.IsHot(vol.VolType) {
if dp.getLeaderAddr() == "" && (time.Now().Unix()-dp.LeaderReportTime > c.cfg.DpNoLeaderReportIntervalSec) {
noLeaderDPs = append(noLeaderDPs, dp)
}
}
if dp.ReplicaNum > uint8(len(dp.Hosts)) || dp.ReplicaNum > uint8(len(dp.Replicas)) {
lackReplicaDPs = append(lackReplicaDPs, dp)
}
if (dp.GetDecommissionStatus() == DecommissionInitial || dp.GetDecommissionStatus() == DecommissionFail) &&
(uint8(len(dp.Hosts)) > dp.ReplicaNum || uint8(len(dp.Replicas)) > dp.ReplicaNum) {
excessReplicaDPs = append(excessReplicaDPs, dp)
}
repSizeDiff := 0.0
repSizeSentry := 0.0
repFileCountDiff := uint32(0)
repFileCountSentry := uint32(0)
if len(dp.Replicas) != 0 {
repSizeSentry = float64(dp.Replicas[0].Used)
repFileCountSentry = dp.Replicas[0].FileCount
}
recordReplicaUnavailable := false
for _, replica := range dp.Replicas {
if !recordReplicaUnavailable && replica.Status == proto.Unavailable {
unavailableReplicaDPs = append(unavailableReplicaDPs, dp)
recordReplicaUnavailable = true
}
if dp.IsDoingDecommission() {
continue
}
tempSizeDiff := math.Abs(float64(replica.Used) - repSizeSentry)
if tempSizeDiff > repSizeDiff {
repSizeDiff = tempSizeDiff
}
// take the absolute difference to avoid uint32 underflow when the sentry replica has more files
tempFileCountDiff := replica.FileCount - repFileCountSentry
if replica.FileCount < repFileCountSentry {
tempFileCountDiff = repFileCountSentry - replica.FileCount
}
if tempFileCountDiff > repFileCountDiff {
repFileCountDiff = tempFileCountDiff
}
}
if repSizeDiff > float64(c.cfg.diffReplicaSpaceUsage) {
repUsedSizeDifferDps = append(repUsedSizeDifferDps, dp)
}
if repFileCountDiff > c.cfg.diffReplicaFileCount {
repFileCountDifferDps = append(repFileCountDifferDps, dp)
}
}
}
log.LogInfof("clusterID[%v] lackReplicaDp count:[%v], unavailableReplicaDp count:[%v], "+
"repFileCountDifferDps count[%v], repUsedSizeDifferDps count[%v], "+
"excessReplicaDPs count[%v], noLeaderDPs count[%v]",
c.Name, len(lackReplicaDPs), len(unavailableReplicaDPs),
len(repFileCountDifferDps), len(repUsedSizeDifferDps),
len(excessReplicaDPs), len(noLeaderDPs))
return
}
func (c *Cluster) getDataPartitionByID(partitionID uint64) (dp *DataPartition, err error) {
vols := c.copyVols()
for _, vol := range vols {
if dp, err = vol.getDataPartitionByID(partitionID); err == nil {
return
}
}
err = dataPartitionNotFound(partitionID)
return
}
func (c *Cluster) getMetaPartitionByID(id uint64) (mp *MetaPartition, err error) {
vols := c.copyVols()
for _, vol := range vols {
if mp, err = vol.metaPartition(id); err == nil {
return
}
}
err = metaPartitionNotFound(id)
return
}
func (c *Cluster) putVol(vol *Vol) {
c.volMutex.Lock()
defer c.volMutex.Unlock()
if _, ok := c.vols[vol.Name]; !ok {
c.vols[vol.Name] = vol
}
}
func (c *Cluster) SetVerStrategy(volName string, strategy proto.VolumeVerStrategy, isForce bool) (err error) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
vol, ok := c.vols[volName]
if !ok {
err = proto.ErrVolNotExists
return
}
if !proto.IsHot(vol.VolType) {
err = fmt.Errorf("vol need be hot one")
return
}
return vol.VersionMgr.SetVerStrategy(strategy, isForce)
}
func (c *Cluster) getVolVer(volName string) (info *proto.VolumeVerInfo, err error) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
var verSeqPrepare uint64
vol, ok := c.vols[volName]
if !ok {
err = proto.ErrVolNotExists
return
}
if !proto.IsHot(vol.VolType) {
err = fmt.Errorf("vol need be hot one")
return
}
if vol.VersionMgr.enabled && vol.VersionMgr.prepareCommit.prepareInfo != nil {
verSeqPrepare = vol.VersionMgr.prepareCommit.prepareInfo.Ver
}
var pStatus uint8
if vol.VersionMgr.prepareCommit.prepareInfo != nil {
pStatus = vol.VersionMgr.prepareCommit.prepareInfo.Status
}
info = &proto.VolumeVerInfo{
Name: volName,
VerSeq: vol.VersionMgr.verSeq,
VerSeqPrepare: verSeqPrepare,
VerPrepareStatus: pStatus,
Enabled: vol.VersionMgr.enabled,
}
return
}
func (c *Cluster) getVol(volName string) (vol *Vol, err error) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
vol, ok := c.vols[volName]
if !ok {
err = proto.ErrVolNotExists
}
return
}
func (c *Cluster) deleteVol(name string) {
c.volMutex.Lock()
defer c.volMutex.Unlock()
delete(c.vols, name)
return
}
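// markDeleteVol marks a volume for deletion after verifying the auth key, checking the
// dentry-count threshold (unless force deletion is enabled in the config), and, for cold
// volumes, requiring the used space to be zero unless force is set; the new status is
// persisted through raft.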
func (c *Cluster) markDeleteVol(name, authKey string, force bool) (err error) {
var (
vol *Vol
serverAuthKey string
)
if vol, err = c.getVol(name); err != nil {
log.LogErrorf("action[markDeleteVol] err[%v]", err)
return proto.ErrVolNotExists
}
if !c.cfg.volForceDeletion {
volDentryCount := uint64(0)
mpsCopy := vol.cloneMetaPartitionMap()
for _, mp := range mpsCopy {
// to avoid latency, fetch latest mp dentry count from metanode
c.doLoadMetaPartition(mp)
mpDentryCount := uint64(0)
for _, response := range mp.LoadResponse {
if response.DentryCount > mpDentryCount {
mpDentryCount = response.DentryCount
}
}
volDentryCount += mpDentryCount
}
if volDentryCount > c.cfg.volDeletionDentryThreshold {
return fmt.Errorf("vol %s is not empty ! it's dentry count %d > dentry count deletion threshold %d, deletion not permitted ! ",
vol.Name, volDentryCount, c.cfg.volDeletionDentryThreshold)
}
}
if proto.IsCold(vol.VolType) && vol.totalUsedSpace() > 0 && !force {
return fmt.Errorf("ec-vol can't be deleted if ec used size not equal 0, now(%d)", vol.totalUsedSpace())
}
serverAuthKey = vol.Owner
if !matchKey(serverAuthKey, authKey) {
return proto.ErrVolAuthKeyNotMatch
}
vol.Status = proto.VolStatusMarkDelete
if err = c.syncUpdateVol(vol); err != nil {
vol.Status = proto.VolStatusNormal
return proto.ErrPersistenceByRaft
}
return
}
func (c *Cluster) batchCreatePreLoadDataPartition(vol *Vol, preload *DataPartitionPreLoad) (err error, dps []*DataPartition) {
if proto.IsHot(vol.VolType) {
return fmt.Errorf("vol type is not warm"), nil
}
total := overSoldCap(uint64(preload.preloadCacheCapacity))
reqCreateCount := (total-1)/(util.DefaultDataPartitionSize/util.GB) + 1
for i := 0; i < int(reqCreateCount); i++ {
log.LogInfof("create preload data partition (%v) total (%v)", i, reqCreateCount)
var dp *DataPartition
if dp, err = c.createDataPartition(vol.Name, preload); err != nil {
log.LogErrorf("create preload data partition fail: volume(%v) err(%v)", vol.Name, err)
return err, nil
}
dps = append(dps, dp)
}
return
}
func (c *Cluster) batchCreateDataPartition(vol *Vol, reqCount int, init bool) (err error) {
if !init {
if _, err = vol.needCreateDataPartition(); err != nil {
log.LogWarnf("action[batchCreateDataPartition] create data partition failed, err[%v]", err)
return
}
}
for i := 0; i < reqCount; i++ {
if c.DisableAutoAllocate {
log.LogWarn("disable auto allocate dataPartition")
return fmt.Errorf("cluster is disable auto allocate dataPartition")
}
if vol.Forbidden {
log.LogWarn("disable auto allocate dataPartition by forbidden volume")
return fmt.Errorf("volume is forbidden")
}
if _, err = c.createDataPartition(vol.Name, nil); err != nil {
log.LogErrorf("action[batchCreateDataPartition] after create [%v] data partition,occurred error,err[%v]", i, err)
break
}
}
return
}
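// isFaultDomain decides whether a volume should be placed through the fault domain and,
// the first time domainOn is turned on for the volume, persists the change.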
func (c *Cluster) isFaultDomain(vol *Vol) bool {
var specifyZoneNeedDomain bool
if c.FaultDomain && !vol.crossZone && !c.needFaultDomain {
if value, ok := c.t.zoneMap.Load(vol.zoneName); ok {
if value.(*Zone).status == unavailableZone {
specifyZoneNeedDomain = true
}
}
}
log.LogInfof("action[isFaultDomain] vol [%v] zoname [%v] FaultDomain[%v] need fault domain[%v] vol crosszone[%v] default[%v] specifyZoneNeedDomain[%v] domainOn[%v]",
vol.Name, vol.zoneName, c.FaultDomain, c.needFaultDomain, vol.crossZone, vol.defaultPriority, specifyZoneNeedDomain, vol.domainOn)
domainOn := c.FaultDomain &&
(vol.domainOn ||
(!vol.crossZone && c.needFaultDomain) || specifyZoneNeedDomain ||
(vol.crossZone && (!vol.defaultPriority ||
(vol.defaultPriority && (c.needFaultDomain || len(c.t.domainExcludeZones) <= 1)))))
if !vol.domainOn && domainOn {
vol.domainOn = domainOn
// todo:(leonchang). updateView is used to update the domainOn status in viewCache; a channel may be better here, otherwise a locking issue may happen
// vol.updateViewCache(c)
c.syncUpdateVol(vol)
log.LogInfof("action[isFaultDomain] vol [%v] set domainOn", vol.Name)
}
return vol.domainOn
}
// Synchronously create a data partition.
// 1. Choose one of the available data nodes.
// 2. Assign it a partition ID.
// 3. Communicate with the data node to synchronously create a data partition.
// - If succeeded, replicate the data through raft and persist it to RocksDB.
// - Otherwise, return an error.
func (c *Cluster) createDataPartition(volName string, preload *DataPartitionPreLoad) (dp *DataPartition, err error) {
log.LogInfof("action[createDataPartition] preload [%v]", preload)
var (
vol *Vol
partitionID uint64
targetHosts []string
targetPeers []proto.Peer
wg sync.WaitGroup
isPreload bool
partitionTTL int64
ok bool
)
c.volMutex.RLock()
if vol, ok = c.vols[volName]; !ok {
err = fmt.Errorf("vol %v not exist", volName)
log.LogWarnf("createDataPartition volName %v not found", volName)
c.volMutex.RUnlock()
return
}
c.volMutex.RUnlock()
dpReplicaNum := vol.dpReplicaNum
zoneName := vol.zoneName
if preload != nil {
dpReplicaNum = uint8(preload.preloadReplicaNum)
zoneName = preload.preloadZoneName
isPreload = true
partitionTTL = int64(preload.PreloadCacheTTL)*util.OneDaySec() + time.Now().Unix()
}
if vol, err = c.getVol(volName); err != nil {
return
}
vol.createDpMutex.Lock()
defer vol.createDpMutex.Unlock()
errChannel := make(chan error, dpReplicaNum)
if c.isFaultDomain(vol) {
if targetHosts, targetPeers, err = c.getHostFromDomainZone(vol.domainId, TypeDataPartition, dpReplicaNum); err != nil {
goto errHandler
}
} else {
zoneNum := c.decideZoneNum(vol.crossZone)
if targetHosts, targetPeers, err = c.getHostFromNormalZone(TypeDataPartition, nil, nil, nil,
int(dpReplicaNum), zoneNum, zoneName); err != nil {
goto errHandler
}
}
if partitionID, err = c.idAlloc.allocateDataPartitionID(); err != nil {
goto errHandler
}
dp = newDataPartition(partitionID, dpReplicaNum, volName, vol.ID, proto.GetDpType(vol.VolType, isPreload), partitionTTL)
dp.Hosts = targetHosts
dp.Peers = targetPeers
log.LogInfof("action[createDataPartition] partitionID [%v] get host [%v]", partitionID, targetHosts)
for _, host := range targetHosts {
wg.Add(1)
go func(host string) {
defer func() {
wg.Done()
}()
var diskPath string
if diskPath, err = c.syncCreateDataPartitionToDataNode(host, vol.dataPartitionSize,
dp, dp.Peers, dp.Hosts, proto.NormalCreateDataPartition, dp.PartitionType, false); err != nil {
errChannel <- err
return
}
dp.Lock()
defer dp.Unlock()
if err = dp.afterCreation(host, diskPath, c); err != nil {
errChannel <- err
}
}(host)
}
wg.Wait()
select {
case err = <-errChannel:
for _, host := range targetHosts {
wg.Add(1)
go func(host string) {
defer func() {
wg.Done()
}()
_, err := dp.getReplica(host)
if err != nil {
return
}
task := dp.createTaskToDeleteDataPartition(host)
tasks := make([]*proto.AdminTask, 0)
tasks = append(tasks, task)
c.addDataNodeTasks(tasks)
}(host)
}
wg.Wait()
goto errHandler
default:
dp.total = vol.dataPartitionSize
dp.setReadWrite()
}
if err = c.syncAddDataPartition(dp); err != nil {
goto errHandler
}
vol.dataPartitions.put(dp)
log.LogInfof("action[createDataPartition] success,volName[%v],partitionId[%v], count[%d]", volName, partitionID, len(vol.dataPartitions.partitions))
return
errHandler:
err = fmt.Errorf("action[createDataPartition],clusterID[%v] vol[%v] Err:%v ", c.Name, volName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
func (c *Cluster) syncCreateDataPartitionToDataNode(host string, size uint64, dp *DataPartition,
peers []proto.Peer, hosts []string, createType int, partitionType int, needRollBack bool) (diskPath string, err error) {
log.LogInfof("action[syncCreateDataPartitionToDataNode] dp [%v] createtype[%v], partitionType[%v]", dp.PartitionID, createType, partitionType)
dataNode, err := c.dataNode(host)
if err != nil {
return
}
task := dp.createTaskToCreateDataPartition(host, size, peers, hosts, createType, partitionType, dataNode.getDecommissionedDisks())
var resp *proto.Packet
if resp, err = dataNode.TaskManager.syncSendAdminTask(task); err != nil {
// data node is not alive or other process error
if needRollBack {
dp.DecommissionNeedRollback = true
c.syncUpdateDataPartition(dp)
}
return
}
return string(resp.Data), nil
}
func (c *Cluster) syncCreateMetaPartitionToMetaNode(host string, mp *MetaPartition) (err error) {
hosts := make([]string, 0)
hosts = append(hosts, host)
tasks := mp.buildNewMetaPartitionTasks(hosts, mp.Peers, mp.volName)
metaNode, err := c.metaNode(host)
if err != nil {
return
}
if _, err = metaNode.Sender.syncSendAdminTask(tasks[0]); err != nil {
return
}
return
}
// decideZoneNum
// if the vol is not cross zone, return 1
// if the vol enables cross zone and the cluster has fewer zones than defaultReplicaNum, return 2
// otherwise, return defaultReplicaNum
func (c *Cluster) decideZoneNum(crossZone bool) (zoneNum int) {
if !crossZone {
return 1
}
var zoneLen int
if c.FaultDomain {
zoneLen = len(c.t.domainExcludeZones)
} else {
zoneLen = c.t.zoneLen()
}
if zoneLen < defaultReplicaNum {
zoneNum = 2
} else {
zoneNum = defaultReplicaNum
}
return zoneNum
}
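// chooseZone2Plus1 picks hosts for a 2- or 3-replica partition from the two candidate zones
// with the most free space: one host from the zone with less space and the remaining replicas
// from the zone with more space.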
func (c *Cluster) chooseZone2Plus1(zones []*Zone, excludeNodeSets []uint64, excludeHosts []string,
nodeType uint32, replicaNum int) (hosts []string, peers []proto.Peer, err error,
) {
if replicaNum < 2 || replicaNum > 3 {
return nil, nil, fmt.Errorf("action[chooseZone2Plus1] replicaNum [%v]", replicaNum)
}
zoneList := make([]*Zone, 2)
if zones[0].getSpaceLeft(nodeType) < zones[1].getSpaceLeft(nodeType) {
zoneList[0] = zones[0]
zoneList[1] = zones[1]
} else {
zoneList[0] = zones[1]
zoneList[1] = zones[0]
}
for i := 2; i < len(zones); i++ {
spaceLeft := zones[i].getSpaceLeft(nodeType)
if spaceLeft > zoneList[0].getSpaceLeft(nodeType) {
if spaceLeft > zoneList[1].getSpaceLeft(nodeType) {
zoneList[1] = zones[i]
} else {
zoneList[0] = zones[i]
}
}
}
log.LogInfof("action[chooseZone2Plus1] type [%v] after check,zone0 [%v] left [%v] zone1 [%v] left [%v]",
nodeType, zoneList[0].name, zoneList[0].getSpaceLeft(nodeType), zoneList[1].name, zoneList[1].getSpaceLeft(nodeType))
num := 1
for _, zone := range zoneList {
selectedHosts, selectedPeers, e := zone.getAvailNodeHosts(nodeType, excludeNodeSets, excludeHosts, num)
if e != nil {
log.LogErrorf("action[getHostFromNormalZone] error [%v]", e)
return nil, nil, e
}
hosts = append(hosts, selectedHosts...)
peers = append(peers, selectedPeers...)
log.LogInfof("action[chooseZone2Plus1] zone [%v] left [%v] get hosts[%v]",
zone.name, zone.getSpaceLeft(nodeType), selectedHosts)
num = replicaNum - num
}
log.LogInfof("action[chooseZone2Plus1] finally get hosts[%v]", hosts)
return hosts, peers, nil
}
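// chooseZoneNormal picks one host per replica, walking the candidate zones round-robin
// starting from the cluster's last used zone index.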
func (c *Cluster) chooseZoneNormal(zones []*Zone, excludeNodeSets []uint64, excludeHosts []string,
nodeType uint32, replicaNum int) (hosts []string, peers []proto.Peer, err error) {
log.LogInfof("action[chooseZoneNormal] zones[%s] nodeType[%d] replicaNum[%d]", printZonesName(zones), nodeType, replicaNum)
c.zoneIdxMux.Lock()
defer c.zoneIdxMux.Unlock()
for i := 0; i < replicaNum; i++ {
zone := zones[c.lastZoneIdxForNode]
c.lastZoneIdxForNode = (c.lastZoneIdxForNode + 1) % len(zones)
selectedHosts, selectedPeers, err := zone.getAvailNodeHosts(nodeType, excludeNodeSets, excludeHosts, 1)
if err != nil {
log.LogErrorf("action[chooseZoneNormal] error [%v]", err)
return nil, nil, err
}
hosts = append(hosts, selectedHosts...)
peers = append(peers, selectedPeers...)
}
return
}
func (c *Cluster) getHostFromNormalZone(nodeType uint32, excludeZones []string, excludeNodeSets []uint64,
excludeHosts []string, replicaNum int,
zoneNum int, specifiedZone string) (hosts []string, peers []proto.Peer, err error,
) {
var zones []*Zone
zones = make([]*Zone, 0)
if replicaNum <= zoneNum {
zoneNum = replicaNum
}
// when creating a vol, if the user specified a zone, we reset zoneNum to 1 so the partition is created in that zone;
// if the specified zone is not writable, we choose a zone randomly
if specifiedZone != "" {
if err = c.checkNormalZoneName(specifiedZone); err != nil {
Warn(c.Name, fmt.Sprintf("cluster[%v],specified zone[%v]is found", c.Name, specifiedZone))
return
}
zoneList := strings.Split(specifiedZone, ",")
for i := 0; i < len(zoneList); i++ {
var zone *Zone
if zone, err = c.t.getZone(zoneList[i]); err != nil {
Warn(c.Name, fmt.Sprintf("cluster[%v],specified zone[%v]is found", c.Name, specifiedZone))
return
}
zones = append(zones, zone)
}
} else {
if nodeType == TypeDataPartition {
if zones, err = c.t.allocZonesForDataNode(zoneNum, replicaNum, excludeZones); err != nil {
return
}
} else {
if zones, err = c.t.allocZonesForMetaNode(zoneNum, replicaNum, excludeZones); err != nil {
return
}
}
}
if len(zones) == 1 {
log.LogInfof("action[getHostFromNormalZone] zones [%v]", zones[0].name)
if hosts, peers, err = zones[0].getAvailNodeHosts(nodeType, excludeNodeSets, excludeHosts, replicaNum); err != nil {
log.LogErrorf("action[getHostFromNormalZone],err[%v]", err)
return
}
goto result
}
hosts = make([]string, 0)
peers = make([]proto.Peer, 0)
if excludeHosts == nil {
excludeHosts = make([]string, 0)
}
if c.cfg.DefaultNormalZoneCnt == defaultNormalCrossZoneCnt && len(zones) >= defaultNormalCrossZoneCnt {
if hosts, peers, err = c.chooseZoneNormal(zones, excludeNodeSets, excludeHosts, nodeType, replicaNum); err != nil {
return
}
} else {
if hosts, peers, err = c.chooseZone2Plus1(zones, excludeNodeSets, excludeHosts, nodeType, replicaNum); err != nil {
return
}
}
result:
log.LogInfof("action[getHostFromNormalZone] replicaNum[%v],zoneNum[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneNum, len(zones), hosts)
if len(hosts) != replicaNum {
log.LogErrorf("action[getHostFromNormalZone] replicaNum[%v],zoneNum[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneNum, len(zones), hosts)
return nil, nil, errors.Trace(proto.ErrNoDataNodeToCreateDataPartition, "hosts len[%v],replicaNum[%v],zoneNum[%v],selectedZones[%v]",
len(hosts), replicaNum, zoneNum, len(zones))
}
return
}
func (c *Cluster) dataNode(addr string) (dataNode *DataNode, err error) {
value, ok := c.dataNodes.Load(addr)
if !ok {
if !c.IsLeader() {
err = errors.New("meta data for data nodes is cleared due to leader change!")
} else {
err = errors.Trace(dataNodeNotFound(addr), "%v not found", addr)
}
return
}
dataNode = value.(*DataNode)
return
}
func (c *Cluster) metaNode(addr string) (metaNode *MetaNode, err error) {
value, ok := c.metaNodes.Load(addr)
if !ok {
if !c.IsLeader() {
err = errors.New("meta data for meta nodes is cleared due to leader change!")
} else {
err = errors.Trace(metaNodeNotFound(addr), "%v not found", addr)
}
return
}
metaNode = value.(*MetaNode)
return
}
func (c *Cluster) lcNode(addr string) (lcNode *LcNode, err error) {
value, ok := c.lcNodes.Load(addr)
if !ok {
err = errors.Trace(lcNodeNotFound(addr), "%v not found", addr)
return
}
lcNode = value.(*LcNode)
return
}
func (c *Cluster) getAllDataPartitionByDataNode(addr string) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
for _, host := range dp.Hosts {
if host == addr {
partitions = append(partitions, dp)
break
}
}
}
}
return
}
func (c *Cluster) getAllMetaPartitionByMetaNode(addr string) (partitions []*MetaPartition) {
partitions = make([]*MetaPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
for _, host := range mp.Hosts {
if host == addr {
partitions = append(partitions, mp)
break
}
}
}
vol.mpsLock.RUnlock()
}
return
}
func (c *Cluster) getAllDataPartitionIDByDatanode(addr string) (partitionIDs []uint64) {
partitionIDs = make([]uint64, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
for _, host := range dp.Hosts {
if host == addr {
partitionIDs = append(partitionIDs, dp.PartitionID)
break
}
}
}
}
return
}
func (c *Cluster) getAllMetaPartitionIDByMetaNode(addr string) (partitionIDs []uint64) {
partitionIDs = make([]uint64, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, mp := range vol.MetaPartitions {
vol.mpsLock.RLock()
for _, host := range mp.Hosts {
if host == addr {
partitionIDs = append(partitionIDs, mp.PartitionID)
break
}
}
vol.mpsLock.RUnlock()
}
}
return
}
func (c *Cluster) getAllMetaPartitionsByMetaNode(addr string) (partitions []*MetaPartition) {
partitions = make([]*MetaPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, mp := range vol.MetaPartitions {
vol.mpsLock.RLock()
for _, host := range mp.Hosts {
if host == addr {
partitions = append(partitions, mp)
break
}
}
vol.mpsLock.RUnlock()
}
}
return
}
func (c *Cluster) decommissionDataNodeCancel(dataNode *DataNode) (err error, failed []uint64) {
if !dataNode.CanBePaused() {
err = fmt.Errorf("action[decommissionDataNodeCancel] dataNode[%v] status[%v] donot support cancel",
dataNode.Addr, dataNode.GetDecommissionStatus())
return
}
dataNode.SetDecommissionStatus(DecommissionPause)
// may confuse the progress of newly allocated dps
dataNode.ToBeOffline = false
dataNode.DecommissionCompleteTime = time.Now().Unix()
if err = c.syncUpdateDataNode(dataNode); err != nil {
log.LogErrorf("action[decommissionDataNodeCancel] dataNode[%v] sync update failed[ %v]",
dataNode.Addr, err.Error())
return
}
for _, disk := range dataNode.DecommissionDiskList {
key := fmt.Sprintf("%s_%s", dataNode.Addr, disk)
if value, ok := c.DecommissionDisks.Load(key); ok {
dd := value.(*DecommissionDisk)
_, dps := c.decommissionDiskCancel(dd)
log.LogInfof("action[decommissionDataNodeCancel] dataNode [%s] pause disk %v with failed dp[%v]",
dataNode.Addr, dd.GenerateKey(), dps)
failed = append(failed, dps...)
}
}
log.LogDebugf("action[decommissionDataNodeCancel] dataNode[%v] cancel decommission, offline %v with failed dp[%v]",
dataNode.Addr, dataNode.ToBeOffline, failed)
return
}
func (c *Cluster) decommissionDiskCancel(disk *DecommissionDisk) (err error, failed []uint64) {
if !disk.CanBePaused() {
err = fmt.Errorf("action[decommissionDiskCancel] dataNode[%v] disk[%s] status[%v] donot support cancel",
disk.SrcAddr, disk.SrcAddr, disk.GetDecommissionStatus())
return
}
disk.SetDecommissionStatus(DecommissionPause)
// disk.DecommissionDpTotal = 0
if err = c.syncUpdateDecommissionDisk(disk); err != nil {
log.LogErrorf("action[decommissionDiskCancel] dataNode[%v] disk[%s] sync update failed[ %v]",
disk.SrcAddr, disk.SrcAddr, err.Error())
return
}
partitions := disk.GetLatestDecommissionDP(c)
dpIds := make([]uint64, 0)
for _, dp := range partitions {
if !dp.PauseDecommission(c) {
failed = append(failed, dp.PartitionID)
}
dpIds = append(dpIds, dp.PartitionID)
}
log.LogDebugf("action[decommissionDiskCancel] dataNode[%v] disk[%s] cancel decommission dps[%v] with failed [%v]",
disk.SrcAddr, disk.SrcAddr, dpIds, failed)
return
}
func (c *Cluster) migrateDataNode(srcAddr, targetAddr string, raftForce bool, limit int) (err error) {
msg := fmt.Sprintf("action[migrateDataNode], src(%s) migrate to target(%s) raftForcs(%v) limit(%v)",
srcAddr, targetAddr, raftForce, limit)
log.LogWarn(msg)
srcNode, err := c.dataNode(srcAddr)
if err != nil {
return
}
if !srcNode.canMarkDecommission() {
err = fmt.Errorf("migrate src(%v) is still on working, please wait,check or cancel if abnormal:%v",
srcAddr, srcNode.GetDecommissionStatus())
log.LogWarnf("action[migrateDataNode] %v", err)
return
}
srcNode.markDecommission(targetAddr, raftForce, limit)
c.syncUpdateDataNode(srcNode)
log.LogInfof("action[migrateDataNode] %v return now", srcAddr)
return
}
func (c *Cluster) decommissionDataNode(dataNode *DataNode, force bool) (err error) {
return c.migrateDataNode(dataNode.Addr, "", false, 0)
}
func (c *Cluster) delDataNodeFromCache(dataNode *DataNode) {
c.dataNodes.Delete(dataNode.Addr)
c.t.deleteDataNode(dataNode)
go dataNode.clean()
}
func (c *Cluster) delDecommissionDiskFromCache(dd *DecommissionDisk) {
c.DecommissionDisks.Delete(dd.GenerateKey())
}
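// decommissionSingleDp migrates a special (one- or two-replica) data partition step by step:
// add the new replica, wait for it to finish repairing, make sure a leader is elected, then
// remove the old replica. Each step is persisted so the process can be paused or resumed.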
func (c *Cluster) decommissionSingleDp(dp *DataPartition, newAddr, offlineAddr string) (err error) {
var (
dataNode *DataNode
decommContinue = false
newReplica *DataReplica
)
ticker := time.NewTicker(time.Second * time.Duration(c.cfg.IntervalToCheckDataPartition))
defer func() {
ticker.Stop()
}()
// 1. add new replica first
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionEnter {
if err = c.addDataReplica(dp, newAddr); err != nil {
err = fmt.Errorf("action[decommissionSingleDp] dp %v addDataReplica fail err %v", dp.PartitionID, err)
goto ERR
}
// if addDataReplica succeeds, the dp can be added to BadDataPartitionIds
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionWaitAddRes)
dp.SetDecommissionStatus(DecommissionRunning)
dp.isRecover = true
dp.Status = proto.ReadOnly
dp.RecoverStartTime = time.Now()
c.syncUpdateDataPartition(dp)
c.putBadDataPartitionIDsByDiskPath(dp.DecommissionSrcDiskPath, dp.DecommissionSrcAddr, dp.PartitionID)
log.LogWarnf("action[decommissionSingleDp] dp %v start wait add replica %v", dp.PartitionID, newAddr)
}
// 2. wait for repair
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionWaitAddRes {
for {
select {
case decommContinue = <-dp.SpecialReplicaDecommissionStop: //
if !decommContinue {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait addDataReplica is stopped", dp.PartitionID)
dp.SetDecommissionStatus(DecommissionPause)
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
}
case <-ticker.C:
if !c.partition.IsRaftLeader() {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait addDataReplica result addr %v master leader changed", dp.PartitionID, newAddr)
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
}
}
// check new replica status
liveReplicas := dp.getLiveReplicasFromHosts(c.cfg.DataPartitionTimeOutSec)
newReplica, err = dp.getReplica(newAddr)
if err != nil {
err = fmt.Errorf("action[decommissionSingleDp] dp %v replica %v not found",
dp.PartitionID, newAddr)
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
}
if len(liveReplicas) == int(dp.ReplicaNum+1) {
log.LogInfof("action[decommissionSingleDp] dp %v replica[%v] status %v",
dp.PartitionID, newReplica.Addr, newReplica.Status)
if newReplica.isRepairing() { // wait for repair
if time.Since(dp.RecoverStartTime) > c.GetDecommissionDataPartitionRecoverTimeOut() {
err = fmt.Errorf("action[decommissionSingleDp] dp %v new replica %v repair time out",
dp.PartitionID, newAddr)
dp.DecommissionNeedRollback = true
newReplica.Status = proto.Unavailable // remove from data partition check
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
}
continue
} else if newReplica.isUnavailable() { // repair failed,need rollback
err = fmt.Errorf("action[decommissionSingleDp] dp %v new replica %v is Unavailable",
dp.PartitionID, newAddr)
dp.DecommissionNeedRollback = true
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
} else {
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionWaitAddResFin)
c.syncUpdateDataPartition(dp)
log.LogInfof("action[decommissionSingleDp] dp %v add replica success", dp.PartitionID)
break
}
}
}
}
// 3. wait for the leader to be elected
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionWaitAddResFin {
if !c.partition.IsRaftLeader() {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait addDataReplica result addr %v master leader changed", dp.PartitionID, newAddr)
goto ERR
}
if dataNode, err = c.dataNode(newAddr); err != nil {
err = fmt.Errorf("action[decommissionSingleDp] dp %v get offlineAddr %v err %v", dp.PartitionID, newAddr, err)
goto ERR
}
times := 0
for {
// if leader is selected
if dp.getLeaderAddr() != "" {
break
}
log.LogInfof("action[decommissionSingleDp] dp %v try tryToChangeLeader addr %v", dp.PartitionID, newAddr)
if err = dp.tryToChangeLeader(c, dataNode); err != nil {
log.LogWarnf("action[decommissionSingleDp] dp %v ChangeLeader to addr %v err %v", dp.PartitionID, newAddr, err)
}
select {
case <-ticker.C:
if !c.partition.IsRaftLeader() {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait tryToChangeLeader addr %v master leader changed", dp.PartitionID, newAddr)
goto ERR
}
times++
if times == 60 {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait leader selection new addr %v timeout", dp.PartitionID, newAddr)
goto ERR
}
case decommContinue = <-dp.SpecialReplicaDecommissionStop:
if !decommContinue {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait for leader selection is stopped", dp.PartitionID)
dp.SetDecommissionStatus(DecommissionPause)
goto ERR
}
}
}
log.LogInfof("action[decommissionSingleDp] dp %v try removeDataReplica %v", dp.PartitionID, offlineAddr)
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionRemoveOld)
c.syncUpdateDataPartition(dp)
}
// 4. delete the offline replica
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionRemoveOld {
if err = c.removeDataReplica(dp, offlineAddr, false, false); err != nil {
err = fmt.Errorf("action[decommissionSingleDp] dp %v err %v", dp.PartitionID, err)
goto ERR
}
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
dp.SetDecommissionStatus(DecommissionSuccess)
c.syncUpdateDataPartition(dp)
log.LogInfof("action[decommissionSingleDp] dp %v success", dp.PartitionID)
return
}
log.LogWarnf("action[decommissionSingleDp] dp %v unexpect end: %v", dp.PartitionID, dp.GetSpecialReplicaDecommissionStep())
return nil
ERR:
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
return err
}
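// autoAddDataReplica adds one replica to a data partition that is short of hosts. Special-replica,
// non-normal and fault-domain partitions are skipped; the target host is chosen from another zone
// for cross-zone volumes, otherwise from the partition's own node set.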
func (c *Cluster) autoAddDataReplica(dp *DataPartition) (success bool, err error) {
var (
targetHosts []string
newAddr string
vol *Vol
zone *Zone
ns *nodeSet
)
success = false
dp.RLock()
// special replica count is not supported
if dp.isSpecialReplicaCnt() {
dp.RUnlock()
return
}
dp.RUnlock()
// non-normal data partitions are not supported
if !proto.IsNormalDp(dp.PartitionType) {
return
}
var ok bool
if vol, ok = c.vols[dp.VolName]; !ok {
log.LogWarnf("action[autoAddDataReplica] clusterID[%v] vol[%v] partitionID[%v] vol not exist, PersistenceHosts:[%v]",
c.Name, dp.VolName, dp.PartitionID, dp.Hosts)
return
}
// fault-domain volumes are not supported
if c.isFaultDomain(vol) {
return
}
if vol.crossZone {
zones := dp.getZones()
if targetHosts, _, err = c.getHostFromNormalZone(TypeDataPartition, zones, nil, dp.Hosts, 1, 1, ""); err != nil {
goto errHandler
}
} else {
if zone, err = c.t.getZone(vol.zoneName); err != nil {
log.LogWarnf("action[autoAddDataReplica] clusterID[%v] vol[%v] partitionID[%v] zone not exist, PersistenceHosts:[%v]",
c.Name, dp.VolName, dp.PartitionID, dp.Hosts)
return
}
nodeSets := dp.getNodeSets()
if len(nodeSets) != 1 {
log.LogWarnf("action[autoAddDataReplica] clusterID[%v] vol[%v] partitionID[%v] the number of nodeSets is not one, PersistenceHosts:[%v]",
c.Name, dp.VolName, dp.PartitionID, dp.Hosts)
return
}
if ns, err = zone.getNodeSet(nodeSets[0]); err != nil {
goto errHandler
}
if targetHosts, _, err = ns.getAvailDataNodeHosts(dp.Hosts, 1); err != nil {
goto errHandler
}
}
newAddr = targetHosts[0]
if err = c.addDataReplica(dp, newAddr); err != nil {
goto errHandler
}
dp.Status = proto.ReadOnly
dp.isRecover = true
c.putBadDataPartitionIDs(nil, newAddr, dp.PartitionID)
dp.RLock()
c.syncUpdateDataPartition(dp)
dp.RUnlock()
log.LogInfof("action[autoAddDataReplica] clusterID[%v] vol[%v] partitionID[%v] auto add data replica success, newReplicaHost[%v], PersistenceHosts:[%v]",
c.Name, dp.VolName, dp.PartitionID, newAddr, dp.Hosts)
success = true
return
errHandler:
if err != nil {
err = fmt.Errorf("clusterID[%v] vol[%v] partitionID[%v], err[%v]", c.Name, dp.VolName, dp.PartitionID, err)
log.LogErrorf("action[autoAddDataReplica] err %v", err)
}
return
}
// Decommission a data partition.
// 1. Check if we can decommission a data partition. In the following cases, we are not allowed to do so:
// - (a) a replica is not in the latest host list;
// - (b) a replica has already been taken offline;
// - (c) the remaining number of replicas is less than the majority
// 2. Choose a new data node.
// 3. Synchronously decommission the data partition.
// 4. Synchronously create a new data partition.
// 5. Set the data partition as readOnly.
// 6. Persist the new host list.
func (c *Cluster) migrateDataPartition(srcAddr, targetAddr string, dp *DataPartition, raftForce bool, errMsg string) (err error) {
var (
targetHosts []string
newAddr string
msg string
dataNode *DataNode
zone *Zone
replica *DataReplica
ns *nodeSet
excludeNodeSets []uint64
zones []string
)
log.LogDebugf("[migrateDataPartition] src %v target %v raftForce %v", srcAddr, targetAddr, raftForce)
dp.RLock()
if ok := dp.hasHost(srcAddr); !ok {
dp.RUnlock()
return
}
if dp.isSpecialReplicaCnt() {
if dp.GetSpecialReplicaDecommissionStep() >= SpecialDecommissionInitial {
err = fmt.Errorf("volume [%v] dp [%v] is on decommission", dp.VolName, dp.PartitionID)
log.LogErrorf("action[decommissionDataPartition][%v] ", err)
dp.RUnlock()
return
}
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
}
replica, _ = dp.getReplica(srcAddr)
dp.RUnlock()
// delete it directly if it is not a normal data partition
if !proto.IsNormalDp(dp.PartitionType) {
c.vols[dp.VolName].deleteDataPartition(c, dp)
return
}
if err = c.validateDecommissionDataPartition(dp, srcAddr); err != nil {
goto errHandler
}
if dataNode, err = c.dataNode(srcAddr); err != nil {
goto errHandler
}
if dataNode.ZoneName == "" {
err = fmt.Errorf("dataNode[%v] zone is nil", dataNode.Addr)
goto errHandler
}
if zone, err = c.t.getZone(dataNode.ZoneName); err != nil {
goto errHandler
}
if ns, err = zone.getNodeSet(dataNode.NodeSetID); err != nil {
goto errHandler
}
if targetAddr != "" {
targetHosts = []string{targetAddr}
} else if targetHosts, _, err = ns.getAvailDataNodeHosts(dp.Hosts, 1); err != nil {
if _, ok := c.vols[dp.VolName]; !ok {
log.LogWarnf("clusterID[%v] partitionID:%v on node:%v offline failed,PersistenceHosts:[%v]",
c.Name, dp.PartitionID, srcAddr, dp.Hosts)
goto errHandler
}
if c.isFaultDomain(c.vols[dp.VolName]) {
log.LogErrorf("clusterID[%v] partitionID:%v on node:%v is banlance zone,PersistenceHosts:[%v]",
c.Name, dp.PartitionID, srcAddr, dp.Hosts)
goto errHandler
}
// select data nodes from another node set in the same zone
excludeNodeSets = append(excludeNodeSets, ns.ID)
if targetHosts, _, err = zone.getAvailNodeHosts(TypeDataPartition, excludeNodeSets, dp.Hosts, 1); err != nil {
// select data nodes from another zone
zones = dp.getLiveZones(srcAddr)
var excludeZone []string
if len(zones) == 0 {
excludeZone = append(excludeZone, zone.name)
} else {
excludeZone = append(excludeZone, zones[0])
}
if targetHosts, _, err = c.getHostFromNormalZone(TypeDataPartition, excludeZone, excludeNodeSets, dp.Hosts, 1, 1, ""); err != nil {
goto errHandler
}
}
}
newAddr = targetHosts[0]
err = c.updateDataNodeSize(newAddr, dp)
if err != nil {
log.LogErrorf("action[migrateDataPartition] target addr can't be writable, add %s %s", newAddr, err.Error())
return
}
defer func() {
if err != nil {
c.returnDataSize(newAddr, dp)
}
}()
// for special replica partitions, add the new replica and wait for repair before removing the old one
if dp.ReplicaNum == 1 || (dp.ReplicaNum == 2 && (dp.ReplicaNum == c.vols[dp.VolName].dpReplicaNum) && !raftForce) {
dp.Status = proto.ReadOnly
dp.isRecover = true
c.putBadDataPartitionIDs(replica, srcAddr, dp.PartitionID)
if err = c.decommissionSingleDp(dp, newAddr, srcAddr); err != nil {
goto errHandler
}
} else {
if err = c.removeDataReplica(dp, srcAddr, false, raftForce); err != nil {
goto errHandler
}
if err = c.addDataReplica(dp, newAddr); err != nil {
goto errHandler
}
dp.Status = proto.ReadOnly
dp.isRecover = true
c.putBadDataPartitionIDs(replica, srcAddr, dp.PartitionID)
}
log.LogDebugf("[migrateDataPartition] src %v target %v raftForce %v", srcAddr, targetAddr, raftForce)
dp.RLock()
c.syncUpdateDataPartition(dp)
dp.RUnlock()
log.LogWarnf("[migrateDataPartition] clusterID[%v] partitionID:%v on node:%v offline success,newHost[%v],PersistenceHosts:[%v]",
c.Name, dp.PartitionID, srcAddr, newAddr, dp.Hosts)
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
return
errHandler:
if dp.isSpecialReplicaCnt() {
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionEnter {
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
}
}
msg = fmt.Sprintf(errMsg+" clusterID[%v] partitionID:%v on Node:%v "+
"Then Fix It on newHost:%v Err:%v , PersistenceHosts:%v ",
c.Name, dp.PartitionID, srcAddr, newAddr, err, dp.Hosts)
if err != nil {
Warn(c.Name, msg)
err = fmt.Errorf("vol[%v],partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
log.LogErrorf("actin[decommissionDataPartition] err %v", err)
}
return
}
// Decommission a data partition.
// 1. Check if we can decommission a data partition. In the following cases, we are not allowed to do so:
// - (a) a replica is not in the latest host list;
// - (b) a replica has already been taken offline;
// - (c) the remaining number of replicas is less than the majority
// 2. Choose a new data node.
// 3. Synchronously decommission the data partition.
// 4. Synchronously create a new data partition.
// 5. Set the data partition as readOnly.
// 6. Persist the new host list.
func (c *Cluster) decommissionDataPartition(offlineAddr string, dp *DataPartition, raftForce bool, errMsg string) (err error) {
return c.migrateDataPartition(offlineAddr, "", dp, raftForce, errMsg)
}
func (c *Cluster) validateDecommissionDataPartition(dp *DataPartition, offlineAddr string) (err error) {
dp.RLock()
defer dp.RUnlock()
var vol *Vol
if vol, err = c.getVol(dp.VolName); err != nil {
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v err %v", dp.VolName, dp.PartitionID, err)
return
}
if err = dp.hasMissingOneReplica(offlineAddr, int(vol.dpReplicaNum)); err != nil {
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v err %v", dp.VolName, dp.PartitionID, err)
return
}
// check whether the partition can be taken offline
if err = dp.canBeOffLine(offlineAddr); err != nil {
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v err %v", dp.VolName, dp.PartitionID, err)
return
}
if dp.isRecover && !dp.activeUsedSimilar() {
err = fmt.Errorf("vol[%v],data partition[%v] is recovering,[%v] can't be decommissioned", vol.Name, dp.PartitionID, offlineAddr)
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v err %v", dp.VolName, dp.PartitionID, err)
return
}
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v looks fine!", dp.VolName, dp.PartitionID)
return
}
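// addDataReplica adds a replica of the data partition on addr: the node is first added as a
// raft member of the partition, and the data replica is then created on that node.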
func (c *Cluster) addDataReplica(dp *DataPartition, addr string) (err error) {
defer func() {
if err != nil {
log.LogErrorf("action[addDataReplica],vol[%v],dp %v ,err[%v]", dp.VolName, dp.PartitionID, err)
} else {
log.LogInfof("action[addDataReplica] dp %v add replica dst addr %v success!", dp.PartitionID, addr)
}
}()
log.LogInfof("action[addDataReplica] dp %v try add replica dst addr %v try add raft member", dp.PartitionID, addr)
dp.addReplicaMutex.Lock()
defer dp.addReplicaMutex.Unlock()
dataNode, err := c.dataNode(addr)
if err != nil {
return
}
addPeer := proto.Peer{ID: dataNode.ID, Addr: addr}
if !proto.IsNormalDp(dp.PartitionType) {
return fmt.Errorf("action[addDataReplica] [%d] is not normal dp, not support add or delete replica", dp.PartitionID)
}
log.LogInfof("action[addDataReplica] dp %v dst addr %v try add raft member, node id %v", dp.PartitionID, addr, dataNode.ID)
if err = c.addDataPartitionRaftMember(dp, addPeer); err != nil {
log.LogWarnf("action[addDataReplica] dp %v addr %v try add raft member err [%v]", dp.PartitionID, addr, err)
return
}
log.LogInfof("action[addDataReplica] dp %v addr %v try create data replica", dp.PartitionID, addr)
if err = c.createDataReplica(dp, addPeer); err != nil {
log.LogWarnf("action[addDataReplica] dp %v addr %v createDataReplica err [%v]", dp.PartitionID, addr, err)
return
}
return
}
// deduct the leader replica's used size from the datanode's available space
func (c *Cluster) updateDataNodeSize(addr string, dp *DataPartition) error {
leaderSize := dp.Replicas[0].Used
dataNode, err := c.dataNode(addr)
if err != nil {
return err
}
dataNode.Lock()
defer dataNode.Unlock()
if dataNode.AvailableSpace < 10*util.GB {
return fmt.Errorf("new datanode %s is not writable %d", addr, dataNode.AvailableSpace)
}
dataNode.LastUpdateTime = time.Now()
if dataNode.AvailableSpace < leaderSize {
dataNode.AvailableSpace = 0
return nil
}
dataNode.AvailableSpace -= leaderSize
return nil
}
func (c *Cluster) returnDataSize(addr string, dp *DataPartition) {
leaderSize := dp.Replicas[0].Used
dataNode, err := c.dataNode(addr)
if err != nil {
return
}
dataNode.Lock()
defer dataNode.Unlock()
log.LogWarnf("returnDataSize after error, addr %s, ava %d, leader %d", addr, dataNode.AvailableSpace, leaderSize)
dataNode.LastUpdateTime = time.Now()
dataNode.AvailableSpace += leaderSize
}
func (c *Cluster) buildAddDataPartitionRaftMemberTaskAndSyncSendTask(dp *DataPartition, addPeer proto.Peer, leaderAddr string) (resp *proto.Packet, err error) {
log.LogInfof("action[buildAddDataPartitionRaftMemberTaskAndSyncSendTask] add peer [%v] start", addPeer)
defer func() {
var resultCode uint8
if resp != nil {
resultCode = resp.ResultCode
}
if err != nil {
log.LogErrorf("vol[%v],data partition[%v],resultCode[%v],err[%v]", dp.VolName, dp.PartitionID, resultCode, err)
} else {
log.LogWarnf("vol[%v],data partition[%v],resultCode[%v],err[%v]", dp.VolName, dp.PartitionID, resultCode, err)
}
}()
task, err := dp.createTaskToAddRaftMember(addPeer, leaderAddr)
if err != nil {
return
}
leaderDataNode, err := c.dataNode(leaderAddr)
if err != nil {
return
}
if resp, err = leaderDataNode.TaskManager.syncSendAdminTask(task); err != nil {
return
}
log.LogInfof("action[buildAddDataPartitionRaftMemberTaskAndSyncSendTask] add peer [%v] finished", addPeer)
return
}
func (c *Cluster) addDataPartitionRaftMember(dp *DataPartition, addPeer proto.Peer) (err error) {
var (
candidateAddrs []string
leaderAddr string
)
if leaderAddr, candidateAddrs, err = dp.prepareAddRaftMember(addPeer); err != nil {
// the member may have already been added before (the master has updated the hosts)
return nil
}
dp.Lock()
oldHosts := make([]string, len(dp.Hosts))
copy(oldHosts, dp.Hosts)
oldPeers := make([]proto.Peer, len(dp.Peers))
copy(oldPeers, dp.Peers)
dp.Hosts = append(dp.Hosts, addPeer.Addr)
dp.Peers = append(dp.Peers, addPeer)
dp.Unlock()
// send the task to the leader addr first; if a retry is needed, send it to the other addrs
for index, host := range candidateAddrs {
if leaderAddr == "" && len(candidateAddrs) < int(dp.ReplicaNum) {
time.Sleep(retrySendSyncTaskInternal)
}
_, err = c.buildAddDataPartitionRaftMemberTaskAndSyncSendTask(dp, addPeer, host)
if err == nil {
break
}
if index < len(candidateAddrs)-1 {
time.Sleep(retrySendSyncTaskInternal)
}
}
dp.Lock()
defer dp.Unlock()
if err != nil {
dp.Hosts = oldHosts
dp.Peers = oldPeers
return
}
log.LogInfof("action[addDataPartitionRaftMember] try host [%v] to [%v] peers [%v] to [%v]",
dp.Hosts, dp.Hosts, dp.Peers, dp.Peers)
if err = dp.update("addDataPartitionRaftMember", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
dp.Hosts = oldHosts
dp.Peers = oldPeers
return
}
return
}
func (c *Cluster) createDataReplica(dp *DataPartition, addPeer proto.Peer) (err error) {
vol, err := c.getVol(dp.VolName)
if err != nil {
return
}
dp.RLock()
hosts := make([]string, len(dp.Hosts))
copy(hosts, dp.Hosts)
peers := make([]proto.Peer, len(dp.Peers))
copy(peers, dp.Peers)
dp.RUnlock()
diskPath, err := c.syncCreateDataPartitionToDataNode(addPeer.Addr, vol.dataPartitionSize,
dp, peers, hosts, proto.DecommissionedCreateDataPartition, dp.PartitionType, true)
if err != nil {
return
}
dp.Lock()
defer dp.Unlock()
if err = dp.afterCreation(addPeer.Addr, diskPath, c); err != nil {
return
}
if err = dp.update("createDataReplica", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
return
}
return
}
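// removeDataReplica removes the replica on addr from a normal data partition: it removes the
// raft member, drops the host from the partition's metadata, deletes the replica on the data
// node, and transfers leadership if the removed node was still the leader.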
func (c *Cluster) removeDataReplica(dp *DataPartition, addr string, validate bool, raftForceDel bool) (err error) {
defer func() {
if err != nil {
log.LogErrorf("action[removeDataReplica],vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
}
}()
log.LogInfof("action[removeDataReplica] dp %v try remove replica addr [%v]", dp.PartitionID, addr)
// validate is set to true only in API calls
if validate && !raftForceDel {
if err = c.validateDecommissionDataPartition(dp, addr); err != nil {
return
}
}
dataNode, err := c.dataNode(addr)
if err != nil {
return
}
if !proto.IsNormalDp(dp.PartitionType) {
return fmt.Errorf("[%d] is not normal dp, not support add or delete replica", dp.PartitionID)
}
removePeer := proto.Peer{ID: dataNode.ID, Addr: addr}
if err = c.removeDataPartitionRaftMember(dp, removePeer, raftForceDel); err != nil {
return
}
if err = c.removeHostMember(dp, removePeer); err != nil {
return
}
if err = c.deleteDataReplica(dp, dataNode); err != nil {
return
}
// the leader may already have changed during the last decommission
leaderAddr := dp.getLeaderAddrWithLock()
if leaderAddr != addr {
return
}
if dataNode, err = c.dataNode(dp.Hosts[0]); err != nil {
return
}
if err = dp.tryToChangeLeader(c, dataNode); err != nil {
return
}
return
}
func (c *Cluster) isRecovering(dp *DataPartition, addr string) (isRecover bool) {
var key string
dp.RLock()
defer dp.RUnlock()
replica, _ := dp.getReplica(addr)
if replica != nil {
key = fmt.Sprintf("%s:%s", addr, replica.DiskPath)
} else {
key = fmt.Sprintf("%s:%s", addr, "")
}
c.badPartitionMutex.RLock()
defer c.badPartitionMutex.RUnlock()
var badPartitionIDs []uint64
badPartitions, ok := c.BadDataPartitionIds.Load(key)
if ok {
badPartitionIDs = badPartitions.([]uint64)
}
for _, id := range badPartitionIDs {
if id == dp.PartitionID {
isRecover = true
}
}
return
}
func (c *Cluster) removeHostMember(dp *DataPartition, removePeer proto.Peer) (err error) {
newHosts := make([]string, 0, len(dp.Hosts)-1)
for _, host := range dp.Hosts {
if host == removePeer.Addr {
continue
}
newHosts = append(newHosts, host)
}
newPeers := make([]proto.Peer, 0, len(dp.Peers)-1)
for _, peer := range dp.Peers {
if peer.ID == removePeer.ID && peer.Addr == removePeer.Addr {
continue
}
newPeers = append(newPeers, peer)
}
dp.Lock()
defer dp.Unlock()
if err = dp.update("removeDataPartitionRaftMember", dp.VolName, newPeers, newHosts, c); err != nil {
return
}
return
}
func (c *Cluster) removeDataPartitionRaftMember(dp *DataPartition, removePeer proto.Peer, force bool) (err error) {
dp.offlineMutex.Lock()
defer dp.offlineMutex.Unlock()
defer func() {
if err1 := c.updateDataPartitionOfflinePeerIDWithLock(dp, 0); err1 != nil {
err = errors.Trace(err, "updateDataPartitionOfflinePeerIDWithLock failed, err[%v]", err1)
}
}()
if err = c.updateDataPartitionOfflinePeerIDWithLock(dp, removePeer.ID); err != nil {
log.LogErrorf("action[removeDataPartitionRaftMember] vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
return
}
return dp.createTaskToRemoveRaftMember(c, removePeer, force)
}
// call from remove raft member
func (c *Cluster) updateDataPartitionOfflinePeerIDWithLock(dp *DataPartition, peerID uint64) (err error) {
dp.Lock()
defer dp.Unlock()
dp.OfflinePeerID = peerID
if err = dp.update("updateDataPartitionOfflinePeerIDWithLock", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
return
}
return
}
func (c *Cluster) deleteDataReplica(dp *DataPartition, dataNode *DataNode) (err error) {
dp.Lock()
// in case the dataNode is unreachable, update the metadata first
dp.removeReplicaByAddr(dataNode.Addr)
dp.checkAndRemoveMissReplica(dataNode.Addr)
if err = dp.update("deleteDataReplica", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
dp.Unlock()
return
}
task := dp.createTaskToDeleteDataPartition(dataNode.Addr)
dp.Unlock()
_, err = dataNode.TaskManager.syncSendAdminTask(task)
if err != nil {
log.LogErrorf("action[deleteDataReplica] vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
}
return nil
}
func (c *Cluster) putBadMetaPartitions(addr string, partitionID uint64) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
newBadPartitionIDs := make([]uint64, 0)
badPartitionIDs, ok := c.BadMetaPartitionIds.Load(addr)
if ok {
newBadPartitionIDs = badPartitionIDs.([]uint64)
}
newBadPartitionIDs = append(newBadPartitionIDs, partitionID)
c.BadMetaPartitionIds.Store(addr, newBadPartitionIDs)
}
func (c *Cluster) getBadMetaPartitionsView() (bmpvs []badPartitionView) {
c.badPartitionMutex.RLock()
defer c.badPartitionMutex.RUnlock()
bmpvs = make([]badPartitionView, 0)
c.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
badPartitionIds := value.([]uint64)
path := key.(string)
bpv := badPartitionView{Path: path, PartitionIDs: badPartitionIds}
bmpvs = append(bmpvs, bpv)
return true
})
return
}
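// putBadDataPartitionIDs records a bad data partition under the "addr:diskPath" key so it can be tracked until recovery completes.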
func (c *Cluster) putBadDataPartitionIDs(replica *DataReplica, addr string, partitionID uint64) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
var key string
newBadPartitionIDs := make([]uint64, 0)
if replica != nil {
key = fmt.Sprintf("%s:%s", addr, replica.DiskPath)
} else {
key = fmt.Sprintf("%s:%s", addr, "")
}
badPartitionIDs, ok := c.BadDataPartitionIds.Load(key)
if ok {
newBadPartitionIDs = badPartitionIDs.([]uint64)
}
newBadPartitionIDs = append(newBadPartitionIDs, partitionID)
c.BadDataPartitionIds.Store(key, newBadPartitionIDs)
}
func (c *Cluster) putBadDataPartitionIDsByDiskPath(disk, addr string, partitionID uint64) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
var key string
newBadPartitionIDs := make([]uint64, 0)
key = fmt.Sprintf("%s:%s", addr, disk)
badPartitionIDs, ok := c.BadDataPartitionIds.Load(key)
if ok {
newBadPartitionIDs = badPartitionIDs.([]uint64)
}
if in(partitionID, newBadPartitionIDs) {
return
}
newBadPartitionIDs = append(newBadPartitionIDs, partitionID)
c.BadDataPartitionIds.Store(key, newBadPartitionIDs)
}
func in(target uint64, arr []uint64) bool {
for _, element := range arr {
if target == element {
return true
}
}
return false
}
func (c *Cluster) getBadDataPartitionsView() (bpvs []badPartitionView) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
bpvs = make([]badPartitionView, 0)
c.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDataPartitionIds := value.([]uint64)
path := key.(string)
bpv := badPartitionView{Path: path, PartitionIDs: badDataPartitionIds}
bpvs = append(bpvs, bpv)
return true
})
return
}
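// getBadDataPartitionsRepairView reports, for every bad data partition, the repair progress of the replica
// on its decommission destination address.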
func (c *Cluster) getBadDataPartitionsRepairView() (bprvs []proto.BadPartitionRepairView) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
bprvs = make([]proto.BadPartitionRepairView, 0)
c.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDataPartitionIds := value.([]uint64)
dpRepairInfos := make([]proto.DpRepairInfo, 0)
path := key.(string)
for _, partitionID := range badDataPartitionIds {
partition, err := c.getDataPartitionByID(partitionID)
if err != nil {
continue
}
replica, err := partition.getReplica(partition.DecommissionDstAddr)
if err != nil {
log.LogDebugf("getBadDataPartitionsRepairView: replica for partitionID[%v] addr[%v] is empty", partitionID, partition.DecommissionDstAddr)
continue
}
dpRepairInfo := proto.DpRepairInfo{PartitionID: partitionID, DecommissionRepairProgress: replica.DecommissionRepairProgress}
dpRepairInfos = append(dpRepairInfos, dpRepairInfo)
log.LogDebugf("getBadDataPartitionsRepairView: partitionID[%v], addr[%v], dpRepairInfo[%v]",
partitionID, partition.DecommissionDstAddr, dpRepairInfo)
}
bprv := proto.BadPartitionRepairView{Path: path, PartitionInfos: dpRepairInfos}
bprvs = append(bprvs, bprv)
return true
})
return
}
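// migrateMetaNode migrates up to limit meta partitions from srcAddr to targetAddr concurrently
// (partitions that already have a replica on targetAddr are skipped).
// When all partitions on the node have been migrated, the metaNode itself is removed from the cluster.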
func (c *Cluster) migrateMetaNode(srcAddr, targetAddr string, limit int) (err error) {
var toBeOfflineMps []*MetaPartition
if c.ForbidMpDecommission {
err = fmt.Errorf("cluster mataPartition decommission switch is disabled")
return
}
msg := fmt.Sprintf("action[migrateMetaNode],clusterID[%v] migrate from node[%v] to [%s] begin", c.Name, srcAddr, targetAddr)
log.LogWarn(msg)
metaNode, err := c.metaNode(srcAddr)
if err != nil {
return err
}
metaNode.MigrateLock.Lock()
defer metaNode.MigrateLock.Unlock()
partitions := c.getAllMetaPartitionByMetaNode(srcAddr)
if targetAddr != "" {
toBeOfflineMps = make([]*MetaPartition, 0)
for _, mp := range partitions {
if contains(mp.Hosts, targetAddr) {
continue
}
toBeOfflineMps = append(toBeOfflineMps, mp)
}
} else {
toBeOfflineMps = partitions
}
if len(toBeOfflineMps) <= 0 && len(partitions) != 0 {
return fmt.Errorf("migrateMataNode no partition can migrate from [%s] to [%s] limit [%v]", srcAddr, targetAddr, limit)
}
if limit <= 0 {
limit = defaultMigrateMpCnt
}
if limit > len(toBeOfflineMps) {
limit = len(toBeOfflineMps)
}
var wg sync.WaitGroup
metaNode.ToBeOffline = true
metaNode.MaxMemAvailWeight = 1
errChannel := make(chan error, limit)
defer func() {
metaNode.ToBeOffline = false
close(errChannel)
}()
for idx := 0; idx < limit; idx++ {
wg.Add(1)
go func(mp *MetaPartition) {
defer wg.Done()
if err1 := c.migrateMetaPartition(srcAddr, targetAddr, mp); err1 != nil {
errChannel <- err1
}
}(toBeOfflineMps[idx])
}
wg.Wait()
select {
case err = <-errChannel:
log.LogErrorf("action[migrateMetaNode] clusterID[%v] migrate node[%s] to [%s] faild, err(%s)",
c.Name, srcAddr, targetAddr, err.Error())
return
default:
}
if limit < len(partitions) {
log.LogWarnf("action[migrateMetaNode] clusterID[%v] migrate from [%s] to [%s] cnt[%d] success",
c.Name, srcAddr, targetAddr, limit)
return
}
if err = c.syncDeleteMetaNode(metaNode); err != nil {
msg = fmt.Sprintf("action[migrateMetaNode], clusterID[%v] node[%v] synDelMetaNode failed,err[%s]",
c.Name, srcAddr, err.Error())
Warn(c.Name, msg)
return
}
c.deleteMetaNodeFromCache(metaNode)
msg = fmt.Sprintf("action[migrateMetaNode],clusterID[%v] migrate from node[%v] to node(%s) success", c.Name, srcAddr, targetAddr)
Warn(c.Name, msg)
return
}
func (c *Cluster) decommissionMetaNode(metaNode *MetaNode) (err error) {
return c.migrateMetaNode(metaNode.Addr, "", 0)
}
func (c *Cluster) deleteMetaNodeFromCache(metaNode *MetaNode) {
c.metaNodes.Delete(metaNode.Addr)
c.t.deleteMetaNode(metaNode)
go metaNode.clean()
}
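// updateVol validates the auth key and the new arguments (capacity, zone name, cache capacity),
// applies them to the volume, and persists the change; on persistence failure the previous settings are restored.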
func (c *Cluster) updateVol(name, authKey string, newArgs *VolVarargs) (err error) {
var (
vol *Vol
serverAuthKey string
volUsedSpace uint64
oldArgs *VolVarargs
)
if vol, err = c.getVol(name); err != nil {
log.LogErrorf("action[updateVol] err[%v]", err)
err = proto.ErrVolNotExists
goto errHandler
}
if vol.status() == proto.VolStatusMarkDelete {
log.LogErrorf("action[updateVol] vol is already deleted, name(%s)", name)
err = proto.ErrVolNotExists
goto errHandler
}
vol.volLock.Lock()
defer vol.volLock.Unlock()
serverAuthKey = vol.Owner
if !matchKey(serverAuthKey, authKey) {
return proto.ErrVolAuthKeyNotMatch
}
volUsedSpace = vol.totalUsedSpace()
if float64(newArgs.capacity*util.GB) < float64(volUsedSpace)*1.01 && newArgs.capacity != vol.Capacity {
err = fmt.Errorf("capacity[%v] has to be 1 percent larger than the used space[%v]", newArgs.capacity,
volUsedSpace/util.GB)
goto errHandler
}
log.LogInfof("[checkZoneName] name [%s], zone [%s]", name, newArgs.zoneName)
if newArgs.zoneName, err = c.checkZoneName(name, vol.crossZone, vol.defaultPriority, newArgs.zoneName, vol.domainId); err != nil {
goto errHandler
}
if newArgs.coldArgs.cacheCap >= newArgs.capacity {
err = fmt.Errorf("capacity must be large than cache capacity, newCap(%d), newCacheCap(%d)", newArgs.capacity, newArgs.coldArgs.cacheCap)
goto errHandler
}
oldArgs = getVolVarargs(vol)
setVolFromArgs(newArgs, vol)
if err = c.syncUpdateVol(vol); err != nil {
setVolFromArgs(oldArgs, vol)
log.LogErrorf("action[updateVol] vol[%v] err[%v]", name, err)
err = proto.ErrPersistenceByRaft
goto errHandler
}
return
errHandler:
err = fmt.Errorf("action[updateVol], clusterID[%v] name:%v, err:%v ", c.Name, name, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
func (c *Cluster) checkNormalZoneName(zoneName string) (err error) {
var zones []string
if c.needFaultDomain {
zones = c.t.domainExcludeZones
} else {
zones = c.t.getZoneNameList()
}
zoneList := strings.Split(zoneName, ",")
for i := 0; i < len(zoneList); i++ {
var isZone bool
for j := 0; j < len(zones); j++ {
if zoneList[i] == zones[j] {
isZone = true
break
}
}
if !isZone {
return fmt.Errorf("action[checkZoneName] the zonename[%s] not found", zoneList[i])
}
}
return
}
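// checkZoneName validates the requested zone name against the cross-zone flag, fault-domain settings,
// and the zones known to the topology, and returns the zone name to use (a non-cross-zone volume without
// a name falls back to the default zone when no fault domain is needed).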
func (c *Cluster) checkZoneName(name string,
crossZone bool,
defaultPriority bool,
zoneName string,
domainId uint64) (newZoneName string, err error,
) {
zoneList := strings.Split(zoneName, ",")
newZoneName = zoneName
if crossZone {
if newZoneName != "" {
if len(zoneList) == 1 {
return newZoneName, fmt.Errorf("action[checkZoneName] vol use specified single zoneName conflit with cross zone flag")
} else {
if err = c.checkNormalZoneName(newZoneName); err != nil {
return newZoneName, err
}
}
}
if c.FaultDomain {
if newZoneName != "" {
if !defaultPriority || domainId > 0 {
return newZoneName, fmt.Errorf("action[checkZoneName] vol need FaultDomain but set zone name")
}
} else {
if domainId > 0 {
if _, ok := c.domainManager.domainId2IndexMap[domainId]; !ok {
return newZoneName, fmt.Errorf("action[checkZoneName] cluster can't find oomainId [%v]", domainId)
}
}
}
} else {
if c.t.zoneLen() <= 1 {
return newZoneName, fmt.Errorf("action[checkZoneName] cluster has one zone,can't cross zone")
}
}
} else { // cross zone disabled means the domain is not used when the vol is created
if newZoneName == "" {
if !c.needFaultDomain {
if _, err = c.t.getZone(DefaultZoneName); err != nil {
return newZoneName, fmt.Errorf("action[checkZoneName] the vol is not cross zone and didn't set zone name,but there's no default zone")
}
log.LogInfof("action[checkZoneName] vol [%v] use default zone", name)
newZoneName = DefaultZoneName
}
} else {
if len(zoneList) > 1 {
return newZoneName, fmt.Errorf("action[checkZoneName] vol specified zoneName need cross zone")
}
if err = c.checkNormalZoneName(newZoneName); err != nil {
return newZoneName, err
}
}
}
return
}
// Create a new volume.
// By default we create 3 meta partitions and 10 data partitions during initialization.
func (c *Cluster) createVol(req *createVolReq) (vol *Vol, err error) {
if c.DisableAutoAllocate {
log.LogWarn("the cluster is frozen")
return nil, fmt.Errorf("the cluster is frozen, can not create volume")
}
var readWriteDataPartitions int
if req.zoneName, err = c.checkZoneName(req.name, req.crossZone, req.normalZonesFirst, req.zoneName, req.domainId); err != nil {
return
}
if vol, err = c.doCreateVol(req); err != nil {
goto errHandler
}
vol.aclMgr.init(c, vol)
vol.initUidSpaceManager(c)
vol.initQuotaManager(c)
if err = vol.VersionMgr.init(c); err != nil {
log.LogError("init dataPartition error in verMgr init", err.Error())
}
if err = vol.initMetaPartitions(c, req.mpCount); err != nil {
vol.Status = proto.VolStatusMarkDelete
if e := vol.deleteVolFromStore(c); e != nil {
log.LogErrorf("action[createVol] deleteVolFromStore failed, vol[%v] err[%v]", vol.Name, e)
}
c.deleteVol(req.name)
err = fmt.Errorf("action[createVol] initMetaPartitions failed, vol[%v] err[%v]", vol.Name, err)
goto errHandler
}
if vol.CacheCapacity > 0 || (proto.IsHot(vol.VolType) && vol.Capacity > 0) {
if req.dpCount > maxInitDataPartitionCnt {
err = fmt.Errorf("action[createVol] initDataPartitions failed, vol[%v], dpCount[%d] exceeds maximum limit[%d]",
req.name, req.dpCount, maxInitDataPartitionCnt)
goto errHandler
}
for retryCount := 0; readWriteDataPartitions < defaultInitMetaPartitionCount && retryCount < 3; retryCount++ {
err = vol.initDataPartitions(c, req.dpCount)
if err != nil {
log.LogError("action[createVol] init dataPartition error ",
err.Error(), retryCount, len(vol.dataPartitions.partitionMap))
}
readWriteDataPartitions = len(vol.dataPartitions.partitionMap)
}
if len(vol.dataPartitions.partitionMap) < defaultInitMetaPartitionCount {
err = fmt.Errorf("action[createVol] vol[%v] initDataPartitions failed, less than %d",
vol.Name, defaultInitMetaPartitionCount)
oldVolStatus := vol.Status
vol.Status = proto.VolStatusMarkDelete
if errSync := c.syncUpdateVol(vol); errSync != nil {
log.LogErrorf("action[createVol] vol[%v] after init dataPartition error, mark vol delete persist failed", vol.Name)
vol.Status = oldVolStatus
} else {
log.LogErrorf("action[createVol] vol[%v] mark vol delete after init dataPartition error", vol.Name)
}
goto errHandler
}
}
vol.dataPartitions.readableAndWritableCnt = readWriteDataPartitions
vol.updateViewCache(c)
log.LogInfof("action[createVol] vol[%v], readableAndWritableCnt[%v]", req.name, readWriteDataPartitions)
return
errHandler:
err = fmt.Errorf("action[createVol], clusterID[%v] name:%v, err:%v ", c.Name, req.name, err)
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
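// doCreateVol builds the volValue from the create request, rejects duplicate names, allocates a volume ID,
// refreshes the OSS secure info, persists the new volume, and adds it to the cache.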
func (c *Cluster) doCreateVol(req *createVolReq) (vol *Vol, err error) {
c.createVolMutex.Lock()
defer c.createVolMutex.Unlock()
createTime := time.Now().Unix() // record unix seconds of volume create time
var dataPartitionSize uint64
if req.dpSize*util.GB == 0 {
dataPartitionSize = util.DefaultDataPartitionSize
} else {
dataPartitionSize = uint64(req.dpSize) * util.GB
}
vv := volValue{
Name: req.name,
Owner: req.owner,
ZoneName: req.zoneName,
DataPartitionSize: dataPartitionSize,
Capacity: uint64(req.capacity),
DpReplicaNum: req.dpReplicaNum,
ReplicaNum: defaultReplicaNum,
FollowerRead: req.followerRead,
Authenticate: req.authenticate,
CrossZone: req.crossZone,
DefaultPriority: req.normalZonesFirst,
DomainId: req.domainId,
CreateTime: createTime,
DeleteLockTime: req.deleteLockTime,
Description: req.description,
EnablePosixAcl: req.enablePosixAcl,
EnableQuota: req.enableQuota,
EnableTransaction: req.enableTransaction,
TxTimeout: req.txTimeout,
TxConflictRetryNum: req.txConflictRetryNum,
TxConflictRetryInterval: req.txConflictRetryInterval,
VolType: req.volType,
EbsBlkSize: req.coldArgs.objBlockSize,
CacheCapacity: req.coldArgs.cacheCap,
CacheAction: req.coldArgs.cacheAction,
CacheThreshold: req.coldArgs.cacheThreshold,
CacheTTL: req.coldArgs.cacheTtl,
CacheHighWater: req.coldArgs.cacheHighWater,
CacheLowWater: req.coldArgs.cacheLowWater,
CacheLRUInterval: req.coldArgs.cacheLRUInterval,
CacheRule: req.coldArgs.cacheRule,
VolQosEnable: req.qosLimitArgs.qosEnable,
IopsRLimit: req.qosLimitArgs.iopsRVal,
IopsWLimit: req.qosLimitArgs.iopsWVal,
FlowRlimit: req.qosLimitArgs.flowRVal,
FlowWlimit: req.qosLimitArgs.flowWVal,
DpReadOnlyWhenVolFull: req.DpReadOnlyWhenVolFull,
}
log.LogInfof("[doCreateVol] volView, %v", vv)
if _, err = c.getVol(req.name); err == nil {
err = proto.ErrDuplicateVol
goto errHandler
}
vv.ID, err = c.idAlloc.allocateCommonID()
if err != nil {
goto errHandler
}
vol = newVol(vv)
log.LogInfof("[doCreateVol] vol, %v", vol)
// refresh oss secure
vol.refreshOSSSecure()
if err = c.syncAddVol(vol); err != nil {
goto errHandler
}
c.putVol(vol)
return
errHandler:
err = fmt.Errorf("action[doCreateVol], clusterID[%v] name:%v, err:%v ", c.Name, req.name, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
// Update the upper bound of the inode ids in a meta partition.
func (c *Cluster) updateInodeIDRange(volName string, start uint64) (err error) {
var (
maxPartitionID uint64
vol *Vol
partition *MetaPartition
)
if vol, err = c.getVol(volName); err != nil {
log.LogErrorf("action[updateInodeIDRange] vol [%v] not found", volName)
return proto.ErrVolNotExists
}
maxPartitionID = vol.maxPartitionID()
if partition, err = vol.metaPartition(maxPartitionID); err != nil {
log.LogErrorf("action[updateInodeIDRange] mp[%v] not found", maxPartitionID)
return proto.ErrMetaPartitionNotExists
}
adjustStart := start
if adjustStart < partition.Start {
adjustStart = partition.Start
}
if adjustStart < partition.MaxInodeID {
adjustStart = partition.MaxInodeID
}
metaPartitionInodeIdStep := gConfig.MetaPartitionInodeIdStep
adjustStart = adjustStart + metaPartitionInodeIdStep
log.LogWarnf("vol[%v],maxMp[%v],start[%v],adjustStart[%v]", volName, maxPartitionID, start, adjustStart)
if err = vol.splitMetaPartition(c, partition, adjustStart, metaPartitionInodeIdStep, false); err != nil {
log.LogErrorf("action[updateInodeIDRange] mp[%v] err[%v]", partition.PartitionID, err)
}
return
}
func (c *Cluster) dataNodeCount() (len int) {
c.dataNodes.Range(func(key, value interface{}) bool {
len++
return true
})
return
}
func (c *Cluster) metaNodeCount() (len int) {
c.metaNodes.Range(func(key, value interface{}) bool {
len++
return true
})
return
}
func (c *Cluster) allMasterNodes() (masterNodes []proto.NodeView) {
masterNodes = make([]proto.NodeView, 0)
for _, addr := range c.cfg.peerAddrs {
split := strings.Split(addr, colonSplit)
id, _ := strconv.ParseUint(split[0], 10, 64)
masterNode := proto.NodeView{ID: id, Addr: split[1] + ":" + split[2], IsActive: true}
masterNodes = append(masterNodes, masterNode)
}
return masterNodes
}
func (c *Cluster) lcNodeCount() (len int) {
c.lcNodes.Range(func(key, value interface{}) bool {
len++
return true
})
return
}
func (c *Cluster) allDataNodes() (dataNodes []proto.NodeView) {
dataNodes = make([]proto.NodeView, 0)
c.dataNodes.Range(func(addr, node interface{}) bool {
dataNode := node.(*DataNode)
dataNodes = append(dataNodes, proto.NodeView{
Addr: dataNode.Addr, DomainAddr: dataNode.DomainAddr,
IsActive: dataNode.isActive, ID: dataNode.ID, IsWritable: dataNode.isWriteAble(),
})
return true
})
return
}
func (c *Cluster) allMetaNodes() (metaNodes []proto.NodeView) {
metaNodes = make([]proto.NodeView, 0)
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
metaNodes = append(metaNodes, proto.NodeView{
ID: metaNode.ID, Addr: metaNode.Addr, DomainAddr: metaNode.DomainAddr,
IsActive: metaNode.IsActive, IsWritable: metaNode.isWritable(),
})
return true
})
return
}
// get metaNode with specified condition
func (c *Cluster) getSpecifiedMetaNodes(zones map[string]struct{}, nodeSetIds map[uint64]struct{}) (metaNodes []*MetaNode) {
log.LogInfof("cluster metaNode length:%v", c.allMetaNodes())
// if nodeSetIds is set, choose metaNodes in those nodesets and ignore zones
if len(nodeSetIds) != 0 {
log.LogInfof("select from nodeSet")
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
if _, ok := nodeSetIds[metaNode.NodeSetID]; ok {
metaNodes = append(metaNodes, metaNode)
}
return true
})
return
}
// if zones is set, choose metaNodes in those zones
if len(zones) != 0 {
log.LogInfof("select from zone")
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
if _, ok := zones[metaNode.ZoneName]; ok {
metaNodes = append(metaNodes, metaNode)
}
return true
})
return
}
log.LogInfof("select all cluster metaNode")
// get all metaNodes in cluster
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
metaNodes = append(metaNodes, metaNode)
return true
})
return
}
func (c *Cluster) balanceMetaPartitionLeader(zones map[string]struct{}, nodeSetIds map[uint64]struct{}) error {
sortedNodes := c.getSortLeaderMetaNodes(zones, nodeSetIds)
if sortedNodes == nil || len(sortedNodes.nodes) == 0 {
return errors.New("no metaNode be selected")
}
sortedNodes.balanceLeader()
return nil
}
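// getSortLeaderMetaNodes collects the leader meta partitions held by the selected metaNodes,
// computes the average leader count, and returns the nodes sorted for leader balancing.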
func (c *Cluster) getSortLeaderMetaNodes(zones map[string]struct{}, nodeSetIds map[uint64]struct{}) *sortLeaderMetaNode {
metaNodes := c.getSpecifiedMetaNodes(zones, nodeSetIds)
log.LogInfof("metaNode length:%d", len(metaNodes))
if len(metaNodes) == 0 {
return nil
}
leaderNodes := make([]*LeaderMetaNode, 0)
countM := make(map[string]int)
totalCount := 0
average := 0
for _, node := range metaNodes {
metaPartitions := make([]*MetaPartition, 0)
for _, mp := range node.metaPartitionInfos {
if mp.IsLeader {
metaPartition, err := c.getMetaPartitionByID(mp.PartitionID)
if err != nil {
continue
}
metaPartitions = append(metaPartitions, metaPartition)
}
}
// some metaNode's mps length could be 0
leaderNodes = append(leaderNodes, &LeaderMetaNode{
metaPartitions: metaPartitions,
addr: node.Addr,
})
countM[node.Addr] = len(metaPartitions)
totalCount += len(metaPartitions)
}
if len(leaderNodes) != 0 {
average = totalCount / len(leaderNodes)
}
s := &sortLeaderMetaNode{
nodes: leaderNodes,
leaderCountM: countM,
average: average,
}
sort.Sort(s)
return s
}
func (c *Cluster) allVolNames() (vols []string) {
vols = make([]string, 0)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for name := range c.vols {
vols = append(vols, name)
}
return
}
func (c *Cluster) copyVols() (vols map[string]*Vol) {
vols = make(map[string]*Vol)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for name, vol := range c.vols {
vols[name] = vol
}
return
}
// Return all the volumes except the ones that have been marked to be deleted.
func (c *Cluster) allVols() (vols map[string]*Vol) {
vols = make(map[string]*Vol)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for name, vol := range c.vols {
if vol.Status == proto.VolStatusNormal {
vols[name] = vol
}
}
return
}
func (c *Cluster) getDataPartitionCount() (count int) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for _, vol := range c.vols {
count = count + len(vol.dataPartitions.partitions)
}
return
}
func (c *Cluster) getMetaPartitionCount() (count int) {
vols := c.copyVols()
for _, vol := range vols {
vol.mpsLock.RLock()
count = count + len(vol.MetaPartitions)
vol.mpsLock.RUnlock()
}
return count
}
func (c *Cluster) setClusterInfo(dirLimit uint32) (err error) {
oldLimit := c.cfg.DirChildrenNumLimit
atomic.StoreUint32(&c.cfg.DirChildrenNumLimit, dirLimit)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setClusterInfo] err[%v]", err)
atomic.StoreUint32(&c.cfg.DirChildrenNumLimit, oldLimit)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) getMonitorPushAddr() (addr string) {
addr = c.cfg.MonitorPushAddr
return
}
func (c *Cluster) setMetaNodeThreshold(threshold float32) (err error) {
if threshold > 1.0 || threshold < 0.0 {
err = fmt.Errorf("set threshold failed: threshold (%v) should between 0.0 and 1.0", threshold)
return
}
oldThreshold := c.cfg.MetaNodeThreshold
c.cfg.MetaNodeThreshold = threshold
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setMetaNodeThreshold] err[%v]", err)
c.cfg.MetaNodeThreshold = oldThreshold
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setMetaNodeDeleteBatchCount(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.MetaNodeDeleteBatchCount)
atomic.StoreUint64(&c.cfg.MetaNodeDeleteBatchCount, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setMetaNodeDeleteBatchCount] err[%v]", err)
atomic.StoreUint64(&c.cfg.MetaNodeDeleteBatchCount, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setClusterLoadFactor(factor float32) (err error) {
oldVal := c.cfg.ClusterLoadFactor
c.cfg.ClusterLoadFactor = factor
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setClusterLoadFactorErr] err[%v]", err)
c.cfg.ClusterLoadFactor = oldVal
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDataNodeDeleteLimitRate(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.DataNodeDeleteLimitRate)
atomic.StoreUint64(&c.cfg.DataNodeDeleteLimitRate, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDataNodeDeleteLimitRate] err[%v]", err)
atomic.StoreUint64(&c.cfg.DataNodeDeleteLimitRate, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDataPartitionMaxRepairErrCnt(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.DpMaxRepairErrCnt)
atomic.StoreUint64(&c.cfg.DpMaxRepairErrCnt, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDataPartitionMaxRepairErrCnt] err[%v]", err)
atomic.StoreUint64(&c.cfg.DpMaxRepairErrCnt, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDataPartitionRepairTimeOut(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.DpRepairTimeOut)
atomic.StoreUint64(&c.cfg.DpRepairTimeOut, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDataPartitionRepairTimeOut] err[%v]", err)
atomic.StoreUint64(&c.cfg.DpRepairTimeOut, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDataNodeAutoRepairLimitRate(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.DataNodeAutoRepairLimitRate)
atomic.StoreUint64(&c.cfg.DataNodeAutoRepairLimitRate, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDataNodeAutoRepairLimitRate] err[%v]", err)
atomic.StoreUint64(&c.cfg.DataNodeAutoRepairLimitRate, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setMetaNodeDeleteWorkerSleepMs(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs)
atomic.StoreUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setMetaNodeDeleteWorkerSleepMs] err[%v]", err)
atomic.StoreUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) getMaxDpCntLimit() (dpCntInLimit uint64) {
dpCntInLimit = atomic.LoadUint64(&c.cfg.MaxDpCntLimit)
return
}
func (c *Cluster) setMaxDpCntLimit(val uint64) (err error) {
if val == 0 {
val = defaultMaxDpCntLimit
}
oldVal := atomic.LoadUint64(&c.cfg.MaxDpCntLimit)
atomic.StoreUint64(&c.cfg.MaxDpCntLimit, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[MaxDpCntLimit] err[%v]", err)
atomic.StoreUint64(&c.cfg.MaxDpCntLimit, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setClusterCreateTime(createTime int64) (err error) {
oldVal := c.CreateTime
c.CreateTime = createTime
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setClusterCreateTime] err[%v]", err)
c.CreateTime = oldVal
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDisableAutoAllocate(disableAutoAllocate bool) (err error) {
oldFlag := c.DisableAutoAllocate
c.DisableAutoAllocate = disableAutoAllocate
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDisableAutoAllocate] err[%v]", err)
c.DisableAutoAllocate = oldFlag
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setForbidMpDecommission(isForbid bool) (err error) {
oldFlag := c.ForbidMpDecommission
c.ForbidMpDecommission = isForbid
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setForbidMpDecommission] err[%v]", err)
c.ForbidMpDecommission = oldFlag
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setMaxConcurrentLcNodes(count uint64) (err error) {
oldCount := c.cfg.MaxConcurrentLcNodes
c.cfg.MaxConcurrentLcNodes = count
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setMaxConcurrentLcNodes] err[%v]", err)
c.cfg.MaxConcurrentLcNodes = oldCount
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) clearVols() {
c.volMutex.Lock()
defer c.volMutex.Unlock()
c.vols = make(map[string]*Vol)
}
func (c *Cluster) clearTopology() {
c.t.clear()
}
func (c *Cluster) clearDataNodes() {
c.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
c.dataNodes.Delete(key)
dataNode.clean()
return true
})
}
func (c *Cluster) clearMetaNodes() {
c.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
c.metaNodes.Delete(key)
metaNode.clean()
return true
})
}
func (c *Cluster) scheduleToCheckDecommissionDataNode() {
go func() {
for {
if c.partition.IsRaftLeader() && c.metaReady {
c.checkDecommissionDataNode()
}
time.Sleep(10 * time.Second)
}
}()
}
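// checkDecommissionDataNode drives dataNode decommission: nodes marked for decommission are started,
// and nodes that finished are deleted from the cluster once no data partitions remain on them
// (otherwise only the decommission status is reset after a grace period).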
func (c *Cluster) checkDecommissionDataNode() {
// decommission datanode mark
c.dataNodes.Range(func(addr, node interface{}) bool {
dataNode := node.(*DataNode)
dataNode.updateDecommissionStatus(c, false)
if dataNode.GetDecommissionStatus() == markDecommission {
c.TryDecommissionDataNode(dataNode)
} else if dataNode.GetDecommissionStatus() == DecommissionSuccess {
partitions := c.getAllDataPartitionByDataNode(dataNode.Addr)
// if only decommission part of data partitions, do not remove the datanode
if len(partitions) != 0 {
if time.Since(time.Unix(dataNode.DecommissionCompleteTime, 0)) > (20 * time.Minute) {
log.LogWarnf("action[checkDecommissionDataNode] dataNode %v decommission completed, "+
"but has dp left, so only reset decommission status", dataNode.Addr)
dataNode.resetDecommissionStatus()
}
return true
}
if err := c.syncDeleteDataNode(dataNode); err != nil {
msg := fmt.Sprintf("action[checkDecommissionDataNode],clusterID[%v] Node[%v] syncDeleteDataNode failed,err[%v]",
c.Name, dataNode.Addr, err)
log.LogWarnf("%s", msg)
} else {
log.LogWarnf("action[checkDecommissionDataNode] del dataNode %v", dataNode.Addr)
c.delDataNodeFromCache(dataNode)
}
}
return true
})
}
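// TryDecommissionDataNode starts decommission of a dataNode: it resumes paused disks if a previous run
// was interrupted, otherwise it groups the node's partitions by disk (honoring DecommissionLimit and the
// destination address) and delegates each disk to migrateDisk.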
func (c *Cluster) TryDecommissionDataNode(dataNode *DataNode) {
var (
toBeOffLinePartitions []*DataPartition
err error
)
log.LogDebugf("action[TryDecommissionDataNode] dataNode [%s] limit[%v]", dataNode.Addr, dataNode.DecommissionLimit)
dataNode.MigrateLock.Lock()
defer func() {
dataNode.MigrateLock.Unlock()
if err != nil {
dataNode.DecommissionRetry++
log.LogDebugf("action[TryDecommissionDataNode] dataNode [%s] retry %v", dataNode.Addr, dataNode.DecommissionRetry)
}
c.syncUpdateDataNode(dataNode)
}()
// recover from stop
if len(dataNode.DecommissionDiskList) != 0 {
for _, disk := range dataNode.DecommissionDiskList {
key := fmt.Sprintf("%s_%s", dataNode.Addr, disk)
// if not found, it may already have succeeded, so only handle disks still in progress
if value, ok := c.DecommissionDisks.Load(key); ok {
dd := value.(*DecommissionDisk)
if dd.GetDecommissionStatus() == DecommissionPause {
dd.SetDecommissionStatus(markDecommission)
log.LogInfof("action[TryDecommissionDataNode] dataNode [%s] restore %v from stop",
dataNode.Addr, dd.GenerateKey())
}
}
}
dataNode.SetDecommissionStatus(DecommissionPrepare)
dataNode.ToBeOffline = true
log.LogDebugf("action[TryDecommissionDataNode] dataNode [%s] recover from DecommissionDiskList", dataNode.Addr)
return
}
log.LogDebugf("action[TryDecommissionDataNode] dataNode [%s] prepare to decommission", dataNode.Addr)
var partitions []*DataPartition
disks := dataNode.getDisks(c)
for _, disk := range disks {
partitionsFromDisk := dataNode.badPartitions(disk, c)
partitions = append(partitions, partitionsFromDisk...)
}
// new dp may have been allocated on this node after a previous decommission was cancelled
// partitions := c.getAllDataPartitionByDataNode(dataNode.Addr)
if dataNode.DecommissionDstAddr != "" {
for _, dp := range partitions {
// two replicas can't exist on the same node
if dp.hasHost(dataNode.DecommissionDstAddr) {
log.LogWarnf("action[TryDecommissionDataNode] skip dp [%v] on both data node", dp.PartitionID)
continue
}
toBeOffLinePartitions = append(toBeOffLinePartitions, dp)
}
} else {
toBeOffLinePartitions = partitions
}
if len(toBeOffLinePartitions) <= 0 && len(partitions) != 0 {
err = fmt.Errorf("DecommissionDataNode no partition can migrate from [%s] to [%s] for replica address conflict",
dataNode.Addr, dataNode.DecommissionDstAddr)
log.LogWarnf("action[TryDecommissionDataNode] %v", err.Error())
return
}
// check dp being decommissioned last time
oldPartitions := c.getAllDecommissionDataPartitionByDataNode(dataNode.Addr)
if len(oldPartitions) != 0 {
toBeOffLinePartitions = mergeDataPartitionArr(toBeOffLinePartitions, oldPartitions)
}
if !(dataNode.DecommissionLimit == 0 || dataNode.DecommissionLimit > len(toBeOffLinePartitions)) {
toBeOffLinePartitions = toBeOffLinePartitions[:dataNode.DecommissionLimit]
}
if len(toBeOffLinePartitions) == 0 {
dataNode.markDecommissionSuccess(c)
return
}
// record the dp count on each disk
dpToDecommissionByDisk := make(map[string]int)
// find the corresponding disk
for _, dp := range toBeOffLinePartitions {
disk := dp.getReplicaDisk(dataNode.Addr)
if disk == "" {
log.LogWarnf("action[TryDecommissionDataNode] ignore dp [%v] on dataNode[%v]with empty disk",
dp.PartitionID, dataNode.Addr)
if dp.IsDecommissionSuccess() {
dp.ResetDecommissionStatus()
c.syncUpdateDataPartition(dp)
}
continue
}
dpToDecommissionByDisk[disk]++
}
decommissionDpTotal := 0
left := len(toBeOffLinePartitions)
decommissionDiskList := make([]string, 0)
for disk, dpCnt := range dpToDecommissionByDisk {
if left == 0 {
break
}
if left-dpCnt >= 0 {
err = c.migrateDisk(dataNode.Addr, disk, dataNode.DecommissionDstAddr, dataNode.DecommissionRaftForce, dpCnt, true, ManualDecommission)
if err != nil {
log.LogWarnf("action[TryDecommissionDataNode] %v failed", err)
continue
}
decommissionDpTotal += dpCnt
left = left - dpCnt
} else {
err = c.migrateDisk(dataNode.Addr, disk, dataNode.DecommissionDstAddr, dataNode.DecommissionRaftForce, left, true, ManualDecommission)
if err != nil {
log.LogWarnf("action[TryDecommissionDataNode] %v failed", err)
continue
}
decommissionDpTotal += left
left = 0
}
decommissionDiskList = append(decommissionDiskList, disk)
}
//put all dp to nodeset's decommission list
//for _, dp := range toBeOffLinePartitions {
// dp.MarkDecommissionStatus(dataNode.Addr, dataNode.DecommissionDstAddr, "",
// dataNode.DecommissionRaftForce, dataNode.DecommissionTerm, c)
// c.syncUpdateDataPartition(dp)
// ns.AddToDecommissionDataPartitionList(dp)
// toBeOffLinePartitionIds = append(toBeOffLinePartitionIds, dp.PartitionID)
//}
// disks wait for decommission
dataNode.SetDecommissionStatus(DecommissionPrepare)
// avoid allocating dp on this node
dataNode.ToBeOffline = true
dataNode.DecommissionDiskList = decommissionDiskList
dataNode.DecommissionDpTotal = decommissionDpTotal
log.LogInfof("action[TryDecommissionDataNode] try decommission disk[%v] from dataNode[%s] "+
"raftForce [%v] to dst [%v] DecommissionDpTotal[%v]",
decommissionDiskList, dataNode.Addr, dataNode.DecommissionRaftForce,
dataNode.DecommissionDstAddr, dataNode.DecommissionDpTotal)
}
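// migrateDisk marks a disk for decommission: it reuses or creates the DecommissionDisk entry for
// nodeAddr_diskPath (rejecting disks that are already being migrated), persists it, and adds it to
// the nodeset decommission list.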
func (c *Cluster) migrateDisk(nodeAddr, diskPath, dstPath string, raftForce bool, limit int, diskDisable bool, migrateType uint32) (err error) {
var disk *DecommissionDisk
key := fmt.Sprintf("%s_%s", nodeAddr, diskPath)
if value, ok := c.DecommissionDisks.Load(key); ok {
disk = value.(*DecommissionDisk)
status := disk.GetDecommissionStatus()
if status == markDecommission || status == DecommissionRunning {
err = fmt.Errorf("migrate src(%v) diskPath(%v)s still on working, please wait,check or cancel if abnormal",
nodeAddr, diskPath)
log.LogWarnf("action[addDecommissionDisk] %v", err)
return
}
} else {
disk = &DecommissionDisk{
SrcAddr: nodeAddr,
DiskPath: diskPath,
DiskDisable: diskDisable,
}
c.DecommissionDisks.Store(disk.GenerateKey(), disk)
}
disk.Type = migrateType
// the disk should decommission all of its dp
disk.markDecommission(dstPath, raftForce, limit)
if err = c.syncAddDecommissionDisk(disk); err != nil {
err = fmt.Errorf("action[addDecommissionDisk],clusterID[%v] dataNodeAddr:%v diskPath:%v err:%v ",
c.Name, nodeAddr, diskPath, err.Error())
Warn(c.Name, err.Error())
c.delDecommissionDiskFromCache(disk)
return
}
// add to the nodeset decommission list
c.addDecommissionDiskToNodeset(disk)
log.LogInfof("action[addDecommissionDisk],clusterID[%v] dataNodeAddr:%v,diskPath[%v] raftForce [%v] "+
"limit [%v], diskDisable [%v], migrateType [%v] term [%v]",
c.Name, nodeAddr, diskPath, raftForce, limit, diskDisable, migrateType, disk.DecommissionTerm)
return
}
func (c *Cluster) restoreStoppedAutoDecommissionDisk(nodeAddr, diskPath string) (err error) {
var disk *DecommissionDisk
key := fmt.Sprintf("%s_%s", nodeAddr, diskPath)
if value, ok := c.DecommissionDisks.Load(key); ok {
disk = value.(*DecommissionDisk)
} else {
return errors.NewErrorf("cannot find auto decommission disk %v", key)
}
if disk.GetDecommissionStatus() != DecommissionPause {
err = fmt.Errorf("decommission disk [%v]is not stopped: %v", key, disk.GetDecommissionStatus())
log.LogWarnf("action[restoreStoppedAutoDecommissionDisk] %v", err)
return
}
if disk.IsManualDecommissionDisk() {
err = fmt.Errorf("decommission disk [%v]is not manual decommission type: %v", key, disk.Type)
log.LogWarnf("action[restoreStoppedAutoDecommissionDisk] %v", err)
return
}
disk.SetDecommissionStatus(markDecommission)
c.syncAddDecommissionDisk(disk)
log.LogInfof("action[restoreStoppedAutoDecommissionDisk],clusterID[%v] dataNodeAddr:%v,diskPath[%v] ",
c.Name, nodeAddr, diskPath)
return
}
func (c *Cluster) scheduleToCheckDecommissionDisk() {
go func() {
for {
if c.partition.IsRaftLeader() && c.metaReady {
c.checkDecommissionDisk()
}
time.Sleep(10 * time.Second)
}
}()
}
func (c *Cluster) checkDecommissionDisk() {
// decommission disk mark
c.DecommissionDisks.Range(func(key, value interface{}) bool {
disk := value.(*DecommissionDisk)
status := disk.GetDecommissionStatus()
if status == DecommissionSuccess || status == DecommissionFail {
if time.Since(time.Unix(disk.DecommissionCompleteTime, 0)) > (20 * time.Minute) {
if err := c.syncDeleteDecommissionDisk(disk); err != nil {
msg := fmt.Sprintf("action[checkDecommissionDisk],clusterID[%v] node[%v] disk[%v],"+
"syncDeleteDecommissionDisk failed,err[%v]",
c.Name, disk.SrcAddr, disk.DiskPath, err)
log.LogWarnf("%s", msg)
} else {
c.delDecommissionDiskFromCache(disk)
log.LogDebugf("action[checkDecommissionDisk] delete DecommissionDisk[%s] status(%v)",
disk.GenerateKey(), status)
}
}
}
return true
})
}
func (c *Cluster) scheduleToBadDisk() {
go func() {
for {
if c.partition.IsRaftLeader() {
c.checkBadDisk()
}
time.Sleep(10 * time.Second)
}
}()
}
func (c *Cluster) checkBadDisk() {
c.dataNodes.Range(func(addr, node interface{}) bool {
//TODO add to auto decommission disk
//dataNode, ok := node.(*DataNode)
//if !ok {
// return true
//}
//for _, badDisk := range dataNode.BadDisks {
//
//}
return true
})
}
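// TryDecommissionDisk marks the data partitions on the disk for decommission, adds them to the nodeset's
// decommission list, and moves the disk to DecommissionRunning; if there are no partitions on the disk
// the decommission succeeds immediately.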
func (c *Cluster) TryDecommissionDisk(disk *DecommissionDisk) {
var (
node *DataNode
err error
badPartitionIds []uint64
badPartitions []*DataPartition
rstMsg string
zone *Zone
ns *nodeSet
)
defer func() {
if err != nil {
disk.DecommissionRetry++
}
c.syncUpdateDecommissionDisk(disk)
}()
if node, err = c.dataNode(disk.SrcAddr); err != nil {
log.LogWarnf("action[TryDecommissionDisk] cannot find dataNode[%s]", disk.SrcAddr)
disk.markDecommissionFailed()
return
}
badPartitions = node.badPartitions(disk.DiskPath, c)
// check dp being decommissioned last time
lastBadPartitions := c.getAllDecommissionDataPartitionByDisk(disk.SrcAddr, disk.DiskPath)
badPartitions = mergeDataPartitionArr(badPartitions, lastBadPartitions)
if len(badPartitions) == 0 {
log.LogInfof("action[TryDecommissionDisk] receive decommissionDisk node[%v] "+
"no any partitions on disk[%v],offline successfully",
node.Addr, disk.DiskPath)
disk.markDecommissionSuccess()
disk.DecommissionDpTotal = 0
if disk.DiskDisable {
c.addAndSyncDecommissionedDisk(node, disk.DiskPath)
}
return
}
// recover from pause
if disk.DecommissionDpTotal != InvalidDecommissionDpCnt {
badPartitions = lastBadPartitions
} else { // the first time for decommission
if disk.DecommissionDpCount == 0 || disk.DecommissionDpCount > len(badPartitions) {
disk.DecommissionDpTotal = len(badPartitions)
} else {
disk.DecommissionDpTotal = disk.DecommissionDpCount
badPartitions = badPartitions[:disk.DecommissionDpCount]
}
}
if zone, err = c.t.getZone(node.ZoneName); err != nil {
log.LogWarnf("action[TryDecommissionDisk] find datanode[%s] zone failed[%v]",
node.Addr, err.Error())
disk.markDecommissionFailed()
return
}
if ns, err = zone.getNodeSet(node.NodeSetID); err != nil {
log.LogWarnf("action[TryDecommissionDisk] find datanode[%s] nodeset[%v] failed[%v]",
node.Addr, node.NodeSetID, err.Error())
disk.markDecommissionFailed()
return
}
for _, dp := range badPartitions {
// dp with decommission success cannot be reset during master load metadata
if dp.IsDecommissionSuccess() && dp.DecommissionTerm == disk.DecommissionTerm {
log.LogInfof("action[TryDecommissionDisk] reset dp [%v] decommission status for disk %v:%v",
dp.PartitionID, disk.SrcAddr, disk.DiskPath)
dp.ResetDecommissionStatus()
c.syncUpdateDataPartition(dp)
disk.DecommissionDpTotal -= 1
continue
}
if !dp.MarkDecommissionStatus(node.Addr, disk.DstAddr, disk.DiskPath, disk.DecommissionRaftForce, disk.DecommissionTerm, c) {
continue
}
c.syncUpdateDataPartition(dp)
ns.AddToDecommissionDataPartitionList(dp, c)
badPartitionIds = append(badPartitionIds, dp.PartitionID)
}
disk.SetDecommissionStatus(DecommissionRunning)
if disk.DiskDisable {
c.addAndSyncDecommissionedDisk(node, disk.DiskPath)
}
rstMsg = fmt.Sprintf("receive decommissionDisk node[%v] disk[%v],badPartitionIds %v,raftForce %v"+
" DecommissionDpTotal %v term %v Type[%v] has offline to [%v]successfully",
node.Addr, disk.DiskPath, badPartitionIds, disk.DecommissionRaftForce,
disk.DecommissionDpTotal, disk.DecommissionTerm, disk.Type, disk.DstAddr)
log.LogInfof("action[TryDecommissionDisk] %s", rstMsg)
}
func (c *Cluster) getAllDecommissionDataPartitionByDataNode(addr string) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
if dp.DecommissionSrcAddr == addr {
partitions = append(partitions, dp)
}
}
}
return
}
func (c *Cluster) getAllDecommissionDataPartitionByDiskAndTerm(addr, disk string, term uint64) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
if dp.DecommissionSrcAddr == addr && dp.DecommissionSrcDiskPath == disk && dp.DecommissionTerm == term {
partitions = append(partitions, dp)
}
}
}
return
}
func (c *Cluster) getAllDecommissionDataPartitionByDisk(addr, disk string) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
if dp.DecommissionSrcAddr == addr && dp.DecommissionSrcDiskPath == disk {
partitions = append(partitions, dp)
}
}
}
return
}
func (c *Cluster) listQuotaAll() (volsInfo []*proto.VolInfo) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for _, vol := range c.vols {
if vol.quotaManager.HasQuota() {
stat := volStat(vol, false)
volInfo := proto.NewVolInfo(vol.Name, vol.Owner, vol.createTime, vol.status(), stat.TotalSize,
stat.UsedSize, stat.DpReadOnlyWhenVolFull)
volsInfo = append(volsInfo, volInfo)
}
}
return
}
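// mergeDataPartitionArr merges two data partition slices and de-duplicates them by partition ID;
// entries from newDps take precedence.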
func mergeDataPartitionArr(newDps, oldDps []*DataPartition) []*DataPartition {
ret := make([]*DataPartition, 0)
tempMap := make(map[uint64]bool)
for _, v := range newDps {
ret = append(ret, v)
tempMap[v.PartitionID] = true
}
for _, v := range oldDps {
if !tempMap[v.PartitionID] {
ret = append(ret, v)
tempMap[v.PartitionID] = true
}
}
return ret
}
func (c *Cluster) generateClusterUuid() (err error) {
cid := "CID-" + uuid.NewString()
c.clusterUuid = cid
if err := c.syncPutCluster(); err != nil {
c.clusterUuid = ""
return errors.NewErrorf(fmt.Sprintf("syncPutCluster failed %v", err.Error()))
}
return
}
func (c *Cluster) initAuthentication(cfg *config.Config) {
var (
authnodes []string
enableHTTPS bool
certFile string
)
authNodeHostConfig := cfg.GetString(AuthNodeHost)
authnodes = strings.Split(authNodeHostConfig, ",")
enableHTTPS = cfg.GetBool(AuthNodeEnableHTTPS)
if enableHTTPS {
certFile = cfg.GetString(AuthNodeCertFile)
}
c.ac = authSDK.NewAuthClient(authnodes, enableHTTPS, certFile)
}
func (c *Cluster) parseAndCheckClientIDKey(r *http.Request, Type proto.MsgType) (err error) {
var (
clientIDKey string
clientID string
clientKey []byte
)
if err = r.ParseForm(); err != nil {
return
}
if clientIDKey, err = extractClientIDKey(r); err != nil {
return
}
if clientID, clientKey, err = proto.ExtractIDAndAuthKey(clientIDKey); err != nil {
return
}
if err = proto.IsValidClientID(clientID); err != nil {
return
}
ticket, err := c.ac.API().GetTicket(clientID, string(clientKey), proto.MasterServiceID)
if err != nil {
err = fmt.Errorf("get ticket from auth node failed, clientIDKey[%v], err[%v]", clientIDKey, err.Error())
return
}
_, err = checkTicket(ticket.Ticket, c.MasterSecretKey, Type)
if err != nil {
err = fmt.Errorf("check ticket failed, clientIDKey[%v], err[%v]", clientIDKey, err.Error())
return
}
return
}
func (c *Cluster) addLcNode(nodeAddr string) (id uint64, err error) {
var ln *LcNode
if value, ok := c.lcNodes.Load(nodeAddr); ok {
ln = value.(*LcNode)
ln.ReportTime = time.Now()
ln.clean()
ln.TaskManager = newAdminTaskManager(ln.Addr, c.Name)
log.LogInfof("action[addLcNode] already add nodeAddr: %v, id: %v", nodeAddr, ln.ID)
} else {
ln = newLcNode(nodeAddr, c.Name)
// allocate LcNode id
if id, err = c.idAlloc.allocateCommonID(); err != nil {
goto errHandler
}
ln.ID = id
log.LogInfof("action[addLcNode] allocateCommonID: %v", id)
}
if err = c.syncAddLcNode(ln); err != nil {
goto errHandler
}
c.lcNodes.Store(nodeAddr, ln)
c.lcMgr.lcNodeStatus.Lock()
c.lcMgr.lcNodeStatus.WorkingCount[nodeAddr] = 0
c.lcMgr.lcNodeStatus.Unlock()
c.snapshotMgr.lcNodeStatus.Lock()
c.snapshotMgr.lcNodeStatus.WorkingCount[nodeAddr] = 0
c.snapshotMgr.lcNodeStatus.Unlock()
log.LogInfof("action[addLcNode], clusterID[%v], lcNodeAddr: %v, id: %v, add idleNodes", c.Name, nodeAddr, ln.ID)
return ln.ID, nil
errHandler:
err = fmt.Errorf("action[addLcNode],clusterID[%v] lcNodeAddr:%v err:%v ", c.Name, nodeAddr, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
type LcNodeStatInfo struct {
Addr string
}
type LcNodeInfoResponse struct {
RegisterInfos []*LcNodeStatInfo
LcConfigurations map[string]*proto.LcConfiguration
LcRuleTaskStatus lcRuleTaskStatus
LcNodeStatus lcNodeStatus
SnapshotVerStatus lcSnapshotVerStatus
SnapshotNodeStatus lcNodeStatus
}
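// getAllLcNodeInfo snapshots the lifecycle and snapshot-deletion state by marshalling each status
// structure under its own lock and unmarshalling it into the response.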
func (c *Cluster) getAllLcNodeInfo() (rsp *LcNodeInfoResponse, err error) {
rsp = &LcNodeInfoResponse{}
c.lcNodes.Range(func(addr, value interface{}) bool {
rsp.RegisterInfos = append(rsp.RegisterInfos, &LcNodeStatInfo{
Addr: addr.(string),
})
return true
})
var b []byte
c.lcMgr.RLock()
if b, err = json.Marshal(c.lcMgr.lcConfigurations); err != nil {
c.lcMgr.RUnlock()
return
}
c.lcMgr.RUnlock()
if err = json.Unmarshal(b, &rsp.LcConfigurations); err != nil {
return
}
c.lcMgr.lcRuleTaskStatus.RLock()
if b, err = json.Marshal(c.lcMgr.lcRuleTaskStatus); err != nil {
c.lcMgr.lcRuleTaskStatus.RUnlock()
return
}
c.lcMgr.lcRuleTaskStatus.RUnlock()
if err = json.Unmarshal(b, &rsp.LcRuleTaskStatus); err != nil {
return
}
c.lcMgr.lcNodeStatus.RLock()
if b, err = json.Marshal(c.lcMgr.lcNodeStatus); err != nil {
c.lcMgr.lcNodeStatus.RUnlock()
return
}
c.lcMgr.lcNodeStatus.RUnlock()
if err = json.Unmarshal(b, &rsp.LcNodeStatus); err != nil {
return
}
c.snapshotMgr.lcSnapshotTaskStatus.RLock()
if b, err = json.Marshal(c.snapshotMgr.lcSnapshotTaskStatus); err != nil {
c.snapshotMgr.lcSnapshotTaskStatus.RUnlock()
return
}
c.snapshotMgr.lcSnapshotTaskStatus.RUnlock()
if err = json.Unmarshal(b, &rsp.SnapshotVerStatus); err != nil {
return
}
c.snapshotMgr.lcNodeStatus.RLock()
if b, err = json.Marshal(c.snapshotMgr.lcNodeStatus); err != nil {
c.snapshotMgr.lcNodeStatus.RUnlock()
return
}
c.snapshotMgr.lcNodeStatus.RUnlock()
if err = json.Unmarshal(b, &rsp.SnapshotNodeStatus); err != nil {
return
}
return
}
func (c *Cluster) clearLcNodes() {
c.lcNodes.Range(func(key, value interface{}) bool {
lcNode := value.(*LcNode)
c.lcNodes.Delete(key)
lcNode.clean()
return true
})
}
func (c *Cluster) delLcNode(nodeAddr string) (err error) {
c.lcMgr.lcNodeStatus.RemoveNode(nodeAddr)
c.snapshotMgr.lcNodeStatus.RemoveNode(nodeAddr)
lcNode, err := c.lcNode(nodeAddr)
if err != nil {
log.LogErrorf("action[delLcNode], clusterID:%v, lcNodeAddr:%v, load err:%v ", c.Name, nodeAddr, err)
return
}
if err = c.syncDeleteLcNode(lcNode); err != nil {
log.LogErrorf("action[delLcNode], clusterID:%v, lcNodeAddr:%v syncDeleteLcNode err:%v ", c.Name, nodeAddr, err)
return
}
val, loaded := c.lcNodes.LoadAndDelete(nodeAddr)
log.LogInfof("action[delLcNode], clusterID:%v, lcNodeAddr:%v, LoadAndDelete result val:%v, loaded:%v", c.Name, nodeAddr, val, loaded)
return
}
func (c *Cluster) scheduleToLcScan() {
go func() {
for {
now := time.Now()
next := now.Add(time.Hour * 24)
next = time.Date(next.Year(), next.Month(), next.Day(), 1, 0, 0, 0, next.Location())
t := time.NewTimer(next.Sub(now))
<-t.C
if c.partition != nil && c.partition.IsRaftLeader() {
c.startLcScan()
}
}
}()
}
func (c *Cluster) startLcScan() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("startLcScan occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"startLcScan occurred panic")
}
}()
c.lcMgr.startLcScan()
}
func (c *Cluster) scheduleToSnapshotDelVerScan() {
go c.snapshotMgr.process()
// make sure all in-progress ver deleting tasks are resumed before checking
waitTime := time.Second * defaultIntervalToCheck
waited := false
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
if !waited {
log.LogInfof("wait for %v seconds once after becoming leader to make sure all the ver deleting tasks are resumed",
waitTime)
time.Sleep(waitTime)
waited = true
}
c.getSnapshotDelVer()
}
time.Sleep(waitTime)
}
}()
}
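// getSnapshotDelVer scans all volumes for versions in VersionDeleting state and queues a SnapshotVerDelTask for each of them.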
func (c *Cluster) getSnapshotDelVer() {
if c.partition == nil || !c.partition.IsRaftLeader() {
log.LogWarn("getSnapshotDelVer: master is not leader")
return
}
c.snapshotMgr.lcSnapshotTaskStatus.ResetVerInfos()
vols := c.allVols()
for volName, vol := range vols {
volVerInfoList := vol.VersionMgr.getVersionList()
for _, volVerInfo := range volVerInfoList.VerList {
if volVerInfo.Status == proto.VersionDeleting {
task := &proto.SnapshotVerDelTask{
Id: fmt.Sprintf("%s:%d", volName, volVerInfo.Ver),
VolName: volName,
VolVersionInfo: volVerInfo,
}
c.snapshotMgr.lcSnapshotTaskStatus.AddVerInfo(task)
}
}
}
log.LogDebug("getSnapshotDelVer AddVerInfo finish")
c.snapshotMgr.lcSnapshotTaskStatus.DeleteOldResult()
log.LogDebug("getSnapshotDelVer DeleteOldResult finish")
}
func (c *Cluster) SetBucketLifecycle(req *proto.LcConfiguration) error {
lcConf := &proto.LcConfiguration{
VolName: req.VolName,
Rules: req.Rules,
}
if c.lcMgr.GetS3BucketLifecycle(req.VolName) != nil {
if err := c.syncUpdateLcConf(lcConf); err != nil {
err = fmt.Errorf("action[SetS3BucketLifecycle],clusterID[%v] vol:%v err:%v ", c.Name, lcConf.VolName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
} else {
if err := c.syncAddLcConf(lcConf); err != nil {
err = fmt.Errorf("action[SetS3BucketLifecycle],clusterID[%v] vol:%v err:%v ", c.Name, lcConf.VolName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
}
_ = c.lcMgr.SetS3BucketLifecycle(lcConf)
log.LogInfof("action[SetS3BucketLifecycle],clusterID[%v] vol:%v", c.Name, lcConf.VolName)
return nil
}
func (c *Cluster) GetBucketLifecycle(VolName string) (lcConf *proto.LcConfiguration) {
lcConf = c.lcMgr.GetS3BucketLifecycle(VolName)
log.LogInfof("action[GetS3BucketLifecycle],clusterID[%v] vol:%v", c.Name, VolName)
return
}
func (c *Cluster) DelBucketLifecycle(VolName string) {
lcConf := &proto.LcConfiguration{
VolName: VolName,
}
if err := c.syncDeleteLcConf(lcConf); err != nil {
err = fmt.Errorf("action[DelS3BucketLifecycle],clusterID[%v] vol:%v err:%v ", c.Name, VolName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
c.lcMgr.DelS3BucketLifecycle(VolName)
log.LogInfof("action[DelS3BucketLifecycle],clusterID[%v] vol:%v", c.Name, VolName)
return
}
func (c *Cluster) addDecommissionDiskToNodeset(dd *DecommissionDisk) (err error) {
var (
node *DataNode
zone *Zone
ns *nodeSet
)
if node, err = c.dataNode(dd.SrcAddr); err != nil {
log.LogWarnf("action[TryDecommissionDisk] cannot find dataNode[%s]", dd.SrcAddr)
return
}
if zone, err = c.t.getZone(node.ZoneName); err != nil {
log.LogWarnf("action[TryDecommissionDisk] find datanode[%s] zone failed[%v]",
node.Addr, err.Error())
return
}
if ns, err = zone.getNodeSet(node.NodeSetID); err != nil {
log.LogWarnf("action[TryDecommissionDisk] find datanode[%s] nodeset[%v] failed[%v]",
node.Addr, node.NodeSetID, err.Error())
return
}
ns.AddDecommissionDisk(dd)
return nil
}
func (c *Cluster) AutoDecommissionDiskIsEnabled() bool {
c.AutoDecommissionDiskMux.Lock()
defer c.AutoDecommissionDiskMux.Unlock()
return c.EnableAutoDecommissionDisk
}
func (c *Cluster) SetAutoDecommissionDisk(flag bool) {
c.AutoDecommissionDiskMux.Lock()
defer c.AutoDecommissionDiskMux.Unlock()
c.EnableAutoDecommissionDisk = flag
}
func (c *Cluster) GetDecommissionDataPartitionRecoverTimeOut() time.Duration {
if c.cfg.DpRepairTimeOut == 0 {
return time.Hour * 2
} else {
return time.Second * time.Duration(c.cfg.DpRepairTimeOut)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strconv"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type nodeStatInfo = proto.NodeStatInfo
type volStatInfo = proto.VolStatInfo
func newVolStatInfo(name string, total, used, cacheTotal, cacheUsed, inodeCount uint64) *volStatInfo {
usedRatio := strconv.FormatFloat(float64(used)/float64(total), 'f', 3, 32)
cacheUsedRatio := "0.00"
if cacheTotal > 0 {
cacheUsedRatio = strconv.FormatFloat(float64(cacheUsed)/float64(cacheTotal), 'f', 3, 32)
}
return &volStatInfo{
Name: name,
TotalSize: total,
UsedSize: used,
UsedRatio: usedRatio,
CacheTotalSize: cacheTotal,
CacheUsedSize: cacheUsed,
CacheUsedRatio: cacheUsedRatio,
InodeCount: inodeCount,
}
}
func newZoneStatInfo() *proto.ZoneStat {
return &proto.ZoneStat{DataNodeStat: new(proto.ZoneNodesStat), MetaNodeStat: new(proto.ZoneNodesStat)}
}
// Check the total space, available space, and daily-used space in data nodes, meta nodes, and volumes
func (c *Cluster) updateStatInfo() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("updateStatInfo occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"updateStatInfo occurred panic")
}
}()
c.updateDataNodeStatInfo()
c.updateMetaNodeStatInfo()
c.updateVolStatInfo()
c.updateZoneStatInfo()
}
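// updateZoneStatInfo recomputes per-zone dataNode and metaNode statistics
// (total, used, and available space in GB, plus writable node counts and used ratio).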
func (c *Cluster) updateZoneStatInfo() {
for _, zone := range c.t.zones {
zs := newZoneStatInfo()
c.zoneStatInfos[zone.name] = zs
zone.dataNodes.Range(func(key, value interface{}) bool {
zs.DataNodeStat.TotalNodes++
node := value.(*DataNode)
if node.isActive && node.isWriteAble() {
zs.DataNodeStat.WritableNodes++
}
zs.DataNodeStat.Total += float64(node.Total) / float64(util.GB)
zs.DataNodeStat.Used += float64(node.Used) / float64(util.GB)
return true
})
zs.DataNodeStat.Total = fixedPoint(zs.DataNodeStat.Total, 2)
zs.DataNodeStat.Used = fixedPoint(zs.DataNodeStat.Used, 2)
zs.DataNodeStat.Avail = fixedPoint(zs.DataNodeStat.Total-zs.DataNodeStat.Used, 2)
if zs.DataNodeStat.Total == 0 {
zs.DataNodeStat.Total = 1
}
zs.DataNodeStat.UsedRatio = fixedPoint(float64(zs.DataNodeStat.Used)/float64(zs.DataNodeStat.Total), 2)
zone.metaNodes.Range(func(key, value interface{}) bool {
zs.MetaNodeStat.TotalNodes++
node := value.(*MetaNode)
if node.IsActive && node.isWritable() {
zs.MetaNodeStat.WritableNodes++
}
zs.MetaNodeStat.Total += float64(node.Total) / float64(util.GB)
zs.MetaNodeStat.Used += float64(node.Used) / float64(util.GB)
return true
})
zs.MetaNodeStat.Total = fixedPoint(zs.MetaNodeStat.Total, 2)
zs.MetaNodeStat.Used = fixedPoint(zs.MetaNodeStat.Used, 2)
zs.MetaNodeStat.Avail = fixedPoint(zs.MetaNodeStat.Total-zs.MetaNodeStat.Used, 2)
if zs.MetaNodeStat.Total == 0 {
zs.MetaNodeStat.Total = 1
}
zs.MetaNodeStat.UsedRatio = fixedPoint(float64(zs.MetaNodeStat.Used)/float64(zs.MetaNodeStat.Total), 2)
}
}
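// fixedPoint rounds x to the given number of decimal places, e.g. fixedPoint(3.14159, 2) returns 3.14.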
func fixedPoint(x float64, scale int) float64 {
decimal := math.Pow10(scale)
return float64(int(math.Round(x*decimal))) / decimal
}
func (c *Cluster) updateDataNodeStatInfo() {
var (
total uint64
used uint64
avail uint64
)
c.dataNodes.Range(func(addr, node interface{}) bool {
dataNode := node.(*DataNode)
total = total + dataNode.Total
used = used + dataNode.Used
if dataNode.isActive {
avail = avail + dataNode.AvailableSpace
}
return true
})
if total <= 0 {
return
}
usedRate := float64(used) / float64(total)
if usedRate > spaceAvailableRate {
Warn(c.Name, fmt.Sprintf("clusterId[%v] space utilization reached [%v],usedSpace[%v],totalSpace[%v] please add dataNode",
c.Name, usedRate, used, total))
}
c.dataNodeStatInfo.TotalGB = total / util.GB
c.dataNodeStatInfo.AvailGB = avail / util.GB
usedGB := used / util.GB
c.dataNodeStatInfo.IncreasedGB = int64(usedGB) - int64(c.dataNodeStatInfo.UsedGB)
c.dataNodeStatInfo.UsedGB = usedGB
c.dataNodeStatInfo.UsedRatio = strconv.FormatFloat(usedRate, 'f', 3, 32)
}
func (c *Cluster) updateMetaNodeStatInfo() {
var (
total uint64
used uint64
avail uint64
)
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
total = total + metaNode.Total
used = used + metaNode.Used
if metaNode.IsActive {
avail = avail + metaNode.MaxMemAvailWeight
}
return true
})
if total <= 0 {
return
}
useRate := float64(used) / float64(total)
if useRate > spaceAvailableRate {
Warn(c.Name, fmt.Sprintf("clusterId[%v] space utilization reached [%v],usedSpace[%v],totalSpace[%v] please add metaNode",
c.Name, useRate, used, total))
}
c.metaNodeStatInfo.TotalGB = total / util.GB
c.metaNodeStatInfo.AvailGB = avail / util.GB
newUsed := used / util.GB
c.metaNodeStatInfo.IncreasedGB = int64(newUsed) - int64(c.metaNodeStatInfo.UsedGB)
c.metaNodeStatInfo.UsedGB = newUsed
c.metaNodeStatInfo.UsedRatio = strconv.FormatFloat(useRate, 'f', 3, 32)
}
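// updateVolStatInfo refreshes per-volume capacity, usage and inode statistics;
// cache statistics are zeroed for hot volumes.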
func (c *Cluster) updateVolStatInfo() {
vols := c.copyVols()
for _, vol := range vols {
used, total := vol.totalUsedSpace(), vol.Capacity*util.GB
if total <= 0 {
continue
}
cacheUsed, cacheTotal := vol.cfsUsedSpace(), vol.CacheCapacity*util.GB
if proto.IsHot(vol.VolType) {
cacheUsed, cacheTotal = 0, 0
}
var inodeCount uint64
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
inodeCount += mp.InodeCount
}
vol.mpsLock.RUnlock()
c.volStatInfo.Store(vol.Name, newVolStatInfo(vol.Name, total, used, cacheTotal, cacheUsed, inodeCount))
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"runtime"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func (c *Cluster) addDataNodeTasks(tasks []*proto.AdminTask) {
for _, t := range tasks {
c.addDataNodeTask(t)
}
}
func (c *Cluster) addDataNodeTask(task *proto.AdminTask) {
if task == nil {
return
}
if node, err := c.dataNode(task.OperatorAddr); err != nil {
log.LogWarn(fmt.Sprintf("action[putTasks],nodeAddr:%v,taskID:%v,err:%v", task.OperatorAddr, task.ID, err))
} else {
node.TaskManager.AddTask(task)
}
}
func (c *Cluster) addMetaNodeTasks(tasks []*proto.AdminTask) {
for _, t := range tasks {
if t == nil {
continue
}
if node, err := c.metaNode(t.OperatorAddr); err != nil {
log.LogWarn(fmt.Sprintf("action[putTasks],nodeAddr:%v,taskID:%v,err:%v", t.OperatorAddr, t.ID, err.Error()))
} else {
node.Sender.AddTask(t)
}
}
}
func (c *Cluster) addLcNodeTasks(tasks []*proto.AdminTask) {
for _, t := range tasks {
if t == nil {
continue
}
if node, err := c.lcNode(t.OperatorAddr); err != nil {
log.LogWarn(fmt.Sprintf("action[putTasks],nodeAddr:%v,taskID:%v,err:%v", t.OperatorAddr, t.ID, err.Error()))
} else {
node.TaskManager.AddTask(t)
}
}
}
func (c *Cluster) waitForResponseToLoadDataPartition(partitions []*DataPartition) {
var wg sync.WaitGroup
for _, dp := range partitions {
wg.Add(1)
go func(dp *DataPartition) {
defer func() {
wg.Done()
if err := recover(); err != nil {
const size = runtimeStackBufSize
buf := make([]byte, size)
buf = buf[:runtime.Stack(buf, false)]
log.LogError(fmt.Sprintf("doLoadDataPartition panic %v: %s\n", err, buf))
}
}()
c.doLoadDataPartition(dp)
}(dp)
}
wg.Wait()
}
func (c *Cluster) loadDataPartition(dp *DataPartition) {
go func() {
c.doLoadDataPartition(dp)
}()
}
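// migrateMetaPartition moves the replica on srcAddr to a new meta node.
// If targetAddr is empty, a destination is chosen from the same node set first,
// then from another node set in the same zone, and finally from another zone.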
func (c *Cluster) migrateMetaPartition(srcAddr, targetAddr string, mp *MetaPartition) (err error) {
var (
newPeers []proto.Peer
metaNode *MetaNode
zone *Zone
ns *nodeSet
excludeNodeSets []uint64
oldHosts []string
zones []string
)
log.LogWarnf("action[migrateMetaPartition],volName[%v], migrate from src[%s] to target[%s],partitionID[%v] begin",
mp.volName, srcAddr, targetAddr, mp.PartitionID)
mp.RLock()
if !contains(mp.Hosts, srcAddr) {
mp.RUnlock()
log.LogErrorf("action[migrateMetaPartition],volName[%v], src[%s] not exist, partitionID[%v]",
mp.volName, srcAddr, mp.PartitionID)
return fmt.Errorf("migrateMetaPartition src [%s] is not exist in mp(%d)", srcAddr, mp.PartitionID)
}
oldHosts = mp.Hosts
mp.RUnlock()
if err = c.validateDecommissionMetaPartition(mp, srcAddr, false); err != nil {
goto errHandler
}
if metaNode, err = c.metaNode(srcAddr); err != nil {
goto errHandler
}
if zone, err = c.t.getZone(metaNode.ZoneName); err != nil {
goto errHandler
}
if ns, err = zone.getNodeSet(metaNode.NodeSetID); err != nil {
goto errHandler
}
if targetAddr != "" {
newPeers = []proto.Peer{{
Addr: targetAddr,
}}
} else if _, newPeers, err = ns.getAvailMetaNodeHosts(oldHosts, 1); err != nil {
if _, ok := c.vols[mp.volName]; !ok {
log.LogWarnf("[migrateMetaPartition] clusterID[%v] partitionID:%v on node:[%v]",
c.Name, mp.PartitionID, mp.Hosts)
return
}
if c.isFaultDomain(c.vols[mp.volName]) {
log.LogWarnf("[migrateMetaPartition] clusterID[%v] partitionID:%v on node:[%v]",
c.Name, mp.PartitionID, mp.Hosts)
return
}
// choose a meta node in other node set in the same zone
excludeNodeSets = append(excludeNodeSets, ns.ID)
if _, newPeers, err = zone.getAvailNodeHosts(TypeMetaPartition, excludeNodeSets, oldHosts, 1); err != nil {
zones = mp.getLiveZones(srcAddr)
var excludeZone []string
if len(zones) == 0 {
excludeZone = append(excludeZone, zone.name)
} else {
excludeZone = append(excludeZone, zones[0])
}
// choose a meta node in other zone
if _, newPeers, err = c.getHostFromNormalZone(TypeMetaPartition, excludeZone, excludeNodeSets, oldHosts, 1, 1, ""); err != nil {
goto errHandler
}
}
}
if err = c.deleteMetaReplica(mp, srcAddr, false, false); err != nil {
goto errHandler
}
if err = c.addMetaReplica(mp, newPeers[0].Addr); err != nil {
goto errHandler
}
mp.IsRecover = true
c.putBadMetaPartitions(srcAddr, mp.PartitionID)
mp.RLock()
c.syncUpdateMetaPartition(mp)
mp.RUnlock()
Warn(c.Name, fmt.Sprintf("action[migrateMetaPartition] clusterID[%v] vol[%v] meta partition[%v] "+
"migrate addr[%v] success,new addr[%v]", c.Name, mp.volName, mp.PartitionID, srcAddr, newPeers[0].Addr))
return
errHandler:
msg := fmt.Sprintf("action[migrateMetaPartition],volName: %v,partitionID: %v,err: %v", mp.volName, mp.PartitionID, errors.Stack(err))
log.LogError(msg)
Warn(c.Name, msg)
if err != nil {
err = fmt.Errorf("action[migrateMetaPartition] vol[%v],partition[%v],err[%v]", mp.volName, mp.PartitionID, err)
}
return
}
// Take the given meta partition offline.
// 1. Check whether the meta partition can be taken offline.
// There are two cases where the partition is not allowed to be offline:
// (1) the replica is not in the latest host list
// (2) there are too few replicas
// 2. Choose a new available meta node.
// 3. Synchronously decommission the meta partition.
// 4. Synchronously create a new meta partition.
// 5. Persist the new host list.
func (c *Cluster) decommissionMetaPartition(nodeAddr string, mp *MetaPartition) (err error) {
if c.ForbidMpDecommission {
err = fmt.Errorf("cluster mataPartition decommission switch is disabled")
return
}
return c.migrateMetaPartition(nodeAddr, "", mp)
}
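// validateDecommissionMetaPartition checks whether the replica on nodeAddr can be taken
// offline: the partition must be allowed to go offline, must not be missing a replica,
// and must not be recovering; forceDel skips the missing and recovery checks.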
func (c *Cluster) validateDecommissionMetaPartition(mp *MetaPartition, nodeAddr string, forceDel bool) (err error) {
mp.RLock()
defer mp.RUnlock()
var vol *Vol
if vol, err = c.getVol(mp.volName); err != nil {
return
}
if err = mp.canBeOffline(nodeAddr, int(vol.mpReplicaNum)); err != nil {
return
}
if forceDel {
log.LogWarnf("action[validateDecommissionMetaPartition] mp relica be force delete without check missing and recovery status")
return
}
if err = mp.hasMissingOneReplica(nodeAddr, int(vol.mpReplicaNum)); err != nil {
return
}
if mp.IsRecover && !mp.activeMaxInodeSimilar() {
err = fmt.Errorf("vol[%v],meta partition[%v] is recovering,[%v] can't be decommissioned", vol.Name, mp.PartitionID, nodeAddr)
return
}
return
}
func (c *Cluster) checkInactiveMetaNodes() (inactiveMetaNodes []string, err error) {
inactiveMetaNodes = make([]string, 0)
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
if !metaNode.IsActive {
inactiveMetaNodes = append(inactiveMetaNodes, metaNode.Addr)
}
return true
})
log.LogInfof("clusterID[%v] inactiveMetaNodes:%v", c.Name, inactiveMetaNodes)
return
}
// check corrupt partitions related to this meta node
func (c *Cluster) checkCorruptMetaNode(metaNode *MetaNode) (corruptPartitions []*MetaPartition, err error) {
var (
partition *MetaPartition
mn *MetaNode
corruptPids []uint64
corruptReplicaNum uint8
)
metaNode.RLock()
defer metaNode.RUnlock()
for _, pid := range metaNode.PersistenceMetaPartitions {
corruptReplicaNum = 0
if partition, err = c.getMetaPartitionByID(pid); err != nil {
return
}
for _, host := range partition.Hosts {
if mn, err = c.metaNode(host); err != nil {
return
}
if !mn.IsActive {
corruptReplicaNum = corruptReplicaNum + 1
}
}
if corruptReplicaNum > partition.ReplicaNum/2 {
corruptPartitions = append(corruptPartitions, partition)
corruptPids = append(corruptPids, pid)
}
}
log.LogInfof("action[checkCorruptMetaNode],clusterID[%v] metaNodeAddr:[%v], corrupt partitions%v",
c.Name, metaNode.Addr, corruptPids)
return
}
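// VolNameSet is a set of volume names, used here to skip partitions that belong to
// volumes marked for deletion.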
type VolNameSet map[string]struct{}
func (c *Cluster) checkReplicaMetaPartitions() (
lackReplicaMetaPartitions []*MetaPartition, noLeaderMetaPartitions []*MetaPartition,
unavailableReplicaMPs []*MetaPartition, excessReplicaMetaPartitions, inodeCountNotEqualMPs, maxInodeNotEqualMPs, dentryCountNotEqualMPs []*MetaPartition, err error) {
lackReplicaMetaPartitions = make([]*MetaPartition, 0)
noLeaderMetaPartitions = make([]*MetaPartition, 0)
excessReplicaMetaPartitions = make([]*MetaPartition, 0)
inodeCountNotEqualMPs = make([]*MetaPartition, 0)
maxInodeNotEqualMPs = make([]*MetaPartition, 0)
dentryCountNotEqualMPs = make([]*MetaPartition, 0)
markDeleteVolNames := make(VolNameSet)
vols := c.copyVols()
for _, vol := range vols {
if vol.Status == proto.VolStatusMarkDelete {
markDeleteVolNames[vol.Name] = struct{}{}
continue
}
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
if uint8(len(mp.Hosts)) < mp.ReplicaNum || uint8(len(mp.Replicas)) < mp.ReplicaNum {
lackReplicaMetaPartitions = append(lackReplicaMetaPartitions, mp)
}
if !mp.isLeaderExist() && (time.Now().Unix()-mp.LeaderReportTime > c.cfg.MpNoLeaderReportIntervalSec) {
noLeaderMetaPartitions = append(noLeaderMetaPartitions, mp)
}
if uint8(len(mp.Hosts)) > mp.ReplicaNum || uint8(len(mp.Replicas)) > mp.ReplicaNum {
excessReplicaMetaPartitions = append(excessReplicaMetaPartitions, mp)
}
for _, replica := range mp.Replicas {
if replica.Status == proto.Unavailable {
unavailableReplicaMPs = append(unavailableReplicaMPs, mp)
break
}
}
}
vol.mpsLock.RUnlock()
}
c.inodeCountNotEqualMP.Range(func(key, value interface{}) bool {
mp := value.(*MetaPartition)
if _, ok := markDeleteVolNames[mp.volName]; !ok {
inodeCountNotEqualMPs = append(inodeCountNotEqualMPs, mp)
}
return true
})
c.maxInodeNotEqualMP.Range(func(key, value interface{}) bool {
mp := value.(*MetaPartition)
if _, ok := markDeleteVolNames[mp.volName]; !ok {
maxInodeNotEqualMPs = append(maxInodeNotEqualMPs, mp)
}
return true
})
c.dentryCountNotEqualMP.Range(func(key, value interface{}) bool {
mp := value.(*MetaPartition)
if _, ok := markDeleteVolNames[mp.volName]; !ok {
dentryCountNotEqualMPs = append(dentryCountNotEqualMPs, mp)
}
return true
})
log.LogInfof("clusterID[%v], lackReplicaMetaPartitions count:[%v], noLeaderMetaPartitions count[%v]"+
"unavailableReplicaMPs count:[%v], excessReplicaMp count:[%v]",
c.Name, len(lackReplicaMetaPartitions), len(noLeaderMetaPartitions),
len(unavailableReplicaMPs), len(excessReplicaMetaPartitions))
return
}
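// deleteMetaReplica removes the meta replica on addr: it optionally validates the
// decommission, removes the raft member, and then deletes the replica on the meta node.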
func (c *Cluster) deleteMetaReplica(partition *MetaPartition, addr string, validate bool, forceDel bool) (err error) {
defer func() {
if err != nil {
log.LogErrorf("action[deleteMetaReplica],vol[%v],data partition[%v],err[%v]", partition.volName, partition.PartitionID, err)
}
}()
if validate {
if err = c.validateDecommissionMetaPartition(partition, addr, forceDel); err != nil {
return
}
}
metaNode, err := c.metaNode(addr)
if err != nil {
return
}
removePeer := proto.Peer{ID: metaNode.ID, Addr: addr}
if err = c.removeMetaPartitionRaftMember(partition, removePeer); err != nil {
return
}
if err = c.deleteMetaPartition(partition, metaNode); err != nil {
return
}
return
}
func (c *Cluster) deleteMetaPartition(partition *MetaPartition, removeMetaNode *MetaNode) (err error) {
partition.Lock()
mr, err := partition.getMetaReplica(removeMetaNode.Addr)
if err != nil {
partition.Unlock()
log.LogErrorf("action[deleteMetaPartition] vol[%v],meta partition[%v], err[%v]", partition.volName, partition.PartitionID, err)
return nil
}
task := mr.createTaskToDeleteReplica(partition.PartitionID)
partition.removeReplicaByAddr(removeMetaNode.Addr)
partition.removeMissingReplica(removeMetaNode.Addr)
partition.Unlock()
_, err = removeMetaNode.Sender.syncSendAdminTask(task)
if err != nil {
log.LogErrorf("action[deleteMetaPartition] vol[%v],meta partition[%v],err[%v]", partition.volName, partition.PartitionID, err)
}
return nil
}
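// removeMetaPartitionRaftMember sends a remove-raft-member task to the leader replica,
// persists the shrunken host and peer lists, and triggers a leader change if the
// removed peer was the leader.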
func (c *Cluster) removeMetaPartitionRaftMember(partition *MetaPartition, removePeer proto.Peer) (err error) {
partition.offlineMutex.Lock()
defer partition.offlineMutex.Unlock()
defer func() {
if err1 := c.updateMetaPartitionOfflinePeerIDWithLock(partition, 0); err1 != nil {
err = errors.Trace(err, "updateMetaPartitionOfflinePeerIDWithLock failed, err[%v]", err1)
}
}()
if err = c.updateMetaPartitionOfflinePeerIDWithLock(partition, removePeer.ID); err != nil {
return
}
mr, err := partition.getMetaReplicaLeader()
if err != nil {
return
}
t, err := partition.createTaskToRemoveRaftMember(removePeer)
if err != nil {
return
}
var leaderMetaNode *MetaNode
leaderMetaNode = mr.metaNode
if leaderMetaNode == nil {
leaderMetaNode, err = c.metaNode(mr.Addr)
if err != nil {
return
}
}
if _, err = leaderMetaNode.Sender.syncSendAdminTask(t); err != nil {
return
}
newHosts := make([]string, 0, len(partition.Hosts)-1)
newPeers := make([]proto.Peer, 0, len(partition.Hosts)-1)
for _, host := range partition.Hosts {
if host == removePeer.Addr {
continue
}
newHosts = append(newHosts, host)
}
for _, peer := range partition.Peers {
if peer.Addr == removePeer.Addr && peer.ID == removePeer.ID {
continue
}
newPeers = append(newPeers, peer)
}
if err = partition.persistToRocksDB("removeMetaPartitionRaftMember", partition.volName, newHosts, newPeers, c); err != nil {
return
}
if mr.Addr != removePeer.Addr {
return
}
metaNode, err := c.metaNode(partition.Hosts[0])
if err != nil {
return
}
if err = partition.tryToChangeLeader(c, metaNode); err != nil {
return
}
return
}
func (c *Cluster) updateMetaPartitionOfflinePeerIDWithLock(mp *MetaPartition, peerID uint64) (err error) {
mp.Lock()
defer mp.Unlock()
mp.OfflinePeerID = peerID
if err = mp.persistToRocksDB("updateMetaPartitionOfflinePeerIDWithLock", mp.volName, mp.Hosts, mp.Peers, c); err != nil {
return
}
return
}
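// addMetaReplica adds a replica on addr: it adds the raft member, persists the expanded
// host and peer lists, creates the replica on the meta node, and records it on the partition.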
func (c *Cluster) addMetaReplica(partition *MetaPartition, addr string) (err error) {
defer func() {
if err != nil {
log.LogErrorf("action[addMetaReplica],vol[%v],data partition[%v],err[%v]", partition.volName, partition.PartitionID, err)
}
}()
partition.Lock()
defer partition.Unlock()
if contains(partition.Hosts, addr) {
err = fmt.Errorf("vol[%v],mp[%v] has contains host[%v]", partition.volName, partition.PartitionID, addr)
return
}
metaNode, err := c.metaNode(addr)
if err != nil {
return
}
addPeer := proto.Peer{ID: metaNode.ID, Addr: addr}
if err = c.addMetaPartitionRaftMember(partition, addPeer); err != nil {
return
}
newHosts := make([]string, 0, len(partition.Hosts)+1)
newPeers := make([]proto.Peer, 0, len(partition.Peers)+1)
newHosts = append(newHosts, partition.Hosts...)
newHosts = append(newHosts, addPeer.Addr)
newPeers = append(newPeers, partition.Peers...)
newPeers = append(newPeers, addPeer)
if err = partition.persistToRocksDB("addMetaReplica", partition.volName, newHosts, newPeers, c); err != nil {
return
}
if err = c.createMetaReplica(partition, addPeer); err != nil {
return
}
if err = partition.afterCreation(addPeer.Addr, c); err != nil {
return
}
return
}
func (c *Cluster) createMetaReplica(partition *MetaPartition, addPeer proto.Peer) (err error) {
task, err := partition.createTaskToCreateReplica(addPeer.Addr)
if err != nil {
return
}
metaNode, err := c.metaNode(addPeer.Addr)
if err != nil {
return
}
if _, err = metaNode.Sender.syncSendAdminTask(task); err != nil {
return
}
return
}
func (c *Cluster) buildAddMetaPartitionRaftMemberTaskAndSyncSend(mp *MetaPartition, addPeer proto.Peer, leaderAddr string) (resp *proto.Packet, err error) {
defer func() {
var resultCode uint8
if resp != nil {
resultCode = resp.ResultCode
}
if err != nil {
log.LogErrorf("action[addMetaRaftMemberAndSend],vol[%v],meta partition[%v],resultCode[%v],err[%v]",
mp.volName, mp.PartitionID, resultCode, err)
} else {
log.LogWarnf("action[addMetaRaftMemberAndSend],vol[%v],meta partition[%v],resultCode[%v]",
mp.volName, mp.PartitionID, resultCode)
}
}()
t, err := mp.createTaskToAddRaftMember(addPeer, leaderAddr)
if err != nil {
return
}
leaderMetaNode, err := c.metaNode(leaderAddr)
if err != nil {
return
}
if resp, err = leaderMetaNode.Sender.syncSendAdminTask(t); err != nil {
return
}
return
}
func (c *Cluster) addMetaPartitionRaftMember(partition *MetaPartition, addPeer proto.Peer) (err error) {
var (
candidateAddrs []string
leaderAddr string
)
candidateAddrs = make([]string, 0, len(partition.Hosts))
leaderMr, err := partition.getMetaReplicaLeader()
if err == nil {
leaderAddr = leaderMr.Addr
if contains(partition.Hosts, leaderAddr) {
candidateAddrs = append(candidateAddrs, leaderAddr)
} else {
leaderAddr = ""
}
}
for _, host := range partition.Hosts {
if host == leaderAddr {
continue
}
candidateAddrs = append(candidateAddrs, host)
}
// send the task to the leader address first; if a retry is needed, send it to the other addresses
for index, host := range candidateAddrs {
// wait for a new leader
if leaderAddr == "" && len(candidateAddrs) < int(partition.ReplicaNum) {
time.Sleep(retrySendSyncTaskInternal)
}
_, err = c.buildAddMetaPartitionRaftMemberTaskAndSyncSend(partition, addPeer, host)
if err == nil {
break
}
if index < len(candidateAddrs)-1 {
time.Sleep(retrySendSyncTaskInternal)
}
}
return
}
func (c *Cluster) loadMetaPartitionAndCheckResponse(mp *MetaPartition) {
go func() {
c.doLoadMetaPartition(mp)
}()
}
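// doLoadMetaPartition asks every replica host in parallel to load the partition,
// collects the load responses, and then checks the snapshots for consistency.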
func (c *Cluster) doLoadMetaPartition(mp *MetaPartition) {
var wg sync.WaitGroup
mp.Lock()
hosts := make([]string, len(mp.Hosts))
copy(hosts, mp.Hosts)
mp.LoadResponse = make([]*proto.MetaPartitionLoadResponse, 0)
mp.Unlock()
errChannel := make(chan error, len(hosts))
for _, host := range hosts {
wg.Add(1)
go func(host string) {
defer func() {
wg.Done()
}()
mr, err := mp.getMetaReplica(host)
if err != nil {
errChannel <- err
return
}
task := mr.createTaskToLoadMetaPartition(mp.PartitionID)
response, err := mr.metaNode.Sender.syncSendAdminTask(task)
if err != nil {
errChannel <- err
return
}
loadResponse := &proto.MetaPartitionLoadResponse{}
if err = json.Unmarshal(response.Data, loadResponse); err != nil {
errChannel <- err
return
}
loadResponse.Addr = host
mp.addOrReplaceLoadResponse(loadResponse)
}(host)
}
wg.Wait()
select {
case err := <-errChannel:
msg := fmt.Sprintf("action[doLoadMetaPartition] vol[%v],mpID[%v],err[%v]", mp.volName, mp.PartitionID, err.Error())
Warn(c.Name, msg)
return
default:
}
mp.checkSnapshot(c)
}
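// doLoadDataPartition sends load tasks to the replicas of a data partition, waits for
// their responses, and then validates CRC and replica size for normal partitions.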
func (c *Cluster) doLoadDataPartition(dp *DataPartition) {
log.LogInfo(fmt.Sprintf("action[doLoadDataPartition],partitionID:%v", dp.PartitionID))
if !dp.needsToCompareCRC() {
log.LogInfo(fmt.Sprintf("action[doLoadDataPartition],partitionID:%v isRecover[%v] don't need compare", dp.PartitionID, dp.isRecover))
return
}
dp.resetFilesWithMissingReplica()
loadTasks := dp.createLoadTasks()
c.addDataNodeTasks(loadTasks)
success := false
for i := 0; i < timeToWaitForResponse; i++ {
if dp.checkLoadResponse(c.cfg.DataPartitionTimeOutSec) {
success = true
break
}
time.Sleep(time.Second)
}
if !success {
return
}
dp.getFileCount()
if proto.IsNormalDp(dp.PartitionType) {
dp.validateCRC(c.Name)
dp.checkReplicaSize(c.Name, c.cfg.diffReplicaSpaceUsage)
}
dp.setToNormal()
}
func (c *Cluster) handleMetaNodeTaskResponse(nodeAddr string, task *proto.AdminTask) (err error) {
if task == nil {
return
}
log.LogDebugf("action[handleMetaNodeTaskResponse] receive Task response:%v from %v now:%v", task.IdString(), nodeAddr, time.Now().Unix())
var metaNode *MetaNode
if metaNode, err = c.metaNode(nodeAddr); err != nil {
goto errHandler
}
metaNode.Sender.DelTask(task)
if err = unmarshalTaskResponse(task); err != nil {
goto errHandler
}
switch task.OpCode {
case proto.OpMetaNodeHeartbeat:
response := task.Response.(*proto.MetaNodeHeartbeatResponse)
err = c.dealMetaNodeHeartbeatResp(task.OperatorAddr, response)
case proto.OpDeleteMetaPartition:
response := task.Response.(*proto.DeleteMetaPartitionResponse)
err = c.dealDeleteMetaPartitionResp(task.OperatorAddr, response)
case proto.OpUpdateMetaPartition:
response := task.Response.(*proto.UpdateMetaPartitionResponse)
err = c.dealUpdateMetaPartitionResp(task.OperatorAddr, response)
case proto.OpVersionOperation:
response := task.Response.(*proto.MultiVersionOpResponse)
err = c.dealOpMetaNodeMultiVerResp(task.OperatorAddr, response)
default:
err := fmt.Errorf("unknown operate code %v", task.OpCode)
log.LogError(err)
}
if err != nil {
log.LogError(fmt.Sprintf("process task[%v] failed", task.ToString()))
} else {
log.LogInfof("process task:%v status:%v success", task.IdString(), task.Status)
}
return
errHandler:
log.LogError(fmt.Sprintf("action[handleMetaNodeTaskResponse],nodeAddr %v,taskId %v,err %v",
nodeAddr, task.IdString(), err.Error()))
return
}
func (c *Cluster) dealUpdateMetaPartitionResp(nodeAddr string, resp *proto.UpdateMetaPartitionResponse) (err error) {
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealUpdateMetaPartitionResp],clusterID[%v] nodeAddr %v update meta partition failed,err %v",
c.Name, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
}
return
}
func (c *Cluster) dealOpMetaNodeMultiVerResp(nodeAddr string, resp *proto.MultiVersionOpResponse) (err error) {
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealOpMetaNodeMultiVerResp],clusterID[%v] volume [%v] nodeAddr %v operate meta partition snapshot version,err %v",
c.Name, resp.VolumeID, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
}
var vol *Vol
if vol, err = c.getVol(resp.VolumeID); err != nil {
return
}
vol.VersionMgr.handleTaskRsp(resp, TypeMetaPartition)
return
}
func (c *Cluster) dealOpDataNodeMultiVerResp(nodeAddr string, resp *proto.MultiVersionOpResponse) (err error) {
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealOpMetaNodeMultiVerResp],clusterID[%v] volume [%v] nodeAddr %v operate meta partition snapshot version,err %v",
c.Name, resp.VolumeID, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
}
var vol *Vol
if vol, err = c.getVol(resp.VolumeID); err != nil {
return
}
vol.VersionMgr.handleTaskRsp(resp, TypeDataPartition)
return
}
func (c *Cluster) dealDeleteMetaPartitionResp(nodeAddr string, resp *proto.DeleteMetaPartitionResponse) (err error) {
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealDeleteMetaPartitionResp],clusterID[%v] nodeAddr %v "+
"delete meta partition failed,err %v", c.Name, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
return
}
var mr *MetaReplica
mp, err := c.getMetaPartitionByID(resp.PartitionID)
if err != nil {
goto errHandler
}
mp.Lock()
defer mp.Unlock()
if mr, err = mp.getMetaReplica(nodeAddr); err != nil {
goto errHandler
}
mp.removeReplica(mr)
return
errHandler:
log.LogError(fmt.Sprintf("dealDeleteMetaPartitionResp %v", err))
return
}
func (c *Cluster) dealMetaNodeHeartbeatResp(nodeAddr string, resp *proto.MetaNodeHeartbeatResponse) (err error) {
var (
metaNode *MetaNode
logMsg string
)
log.LogInfof("action[dealMetaNodeHeartbeatResp],clusterID[%v] receive nodeAddr[%v] heartbeat", c.Name, nodeAddr)
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealMetaNodeHeartbeatResp],clusterID[%v] nodeAddr %v heartbeat failed,err %v",
c.Name, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
return
}
if metaNode, err = c.metaNode(nodeAddr); err != nil {
goto errHandler
}
if metaNode.ToBeOffline {
log.LogInfof("action[dealMetaNodeHeartbeatResp] dataNode is toBeOffline, addr[%s]", nodeAddr)
return
}
if resp.ZoneName == "" {
resp.ZoneName = DefaultZoneName
}
if metaNode.ZoneName != resp.ZoneName {
c.t.deleteMetaNode(metaNode)
oldZoneName := metaNode.ZoneName
metaNode.ZoneName = resp.ZoneName
c.adjustMetaNode(metaNode)
log.LogWarnf("metaNode zone changed from [%v] to [%v]", oldZoneName, resp.ZoneName)
}
// change cpu util and io used
metaNode.CpuUtil.Store(resp.CpuUtil)
metaNode.updateMetric(resp, c.cfg.MetaNodeThreshold)
metaNode.setNodeActive()
if err = c.t.putMetaNode(metaNode); err != nil {
log.LogErrorf("action[dealMetaNodeHeartbeatResp],metaNode[%v] error[%v]", metaNode.Addr, err)
}
c.updateMetaNode(metaNode, resp.MetaPartitionReports, metaNode.reachesThreshold())
// TODO: remove; there is no need to set metaNode.metaPartitionInfos to nil here
// metaNode.metaPartitionInfos = nil
logMsg = fmt.Sprintf("action[dealMetaNodeHeartbeatResp],metaNode:%v,zone[%v], ReportTime:%v success", metaNode.Addr, metaNode.ZoneName, time.Now().Unix())
log.LogInfof(logMsg)
return
errHandler:
logMsg = fmt.Sprintf("nodeAddr %v heartbeat error :%v", nodeAddr, errors.Stack(err))
log.LogError(logMsg)
return
}
func (c *Cluster) adjustMetaNode(metaNode *MetaNode) {
c.mnMutex.Lock()
defer c.mnMutex.Unlock()
oldNodeSetID := metaNode.NodeSetID
var err error
defer func() {
if err != nil {
err = fmt.Errorf("action[adjustMetaNode],clusterID[%v] addr:%v,zone[%v] err:%v ", c.Name, metaNode.Addr, metaNode.ZoneName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
}()
var zone *Zone
zone, err = c.t.getZone(metaNode.ZoneName)
if err != nil {
zone = newZone(metaNode.ZoneName)
c.t.putZone(zone)
}
c.nsMutex.Lock()
ns := zone.getAvailNodeSetForMetaNode()
if ns == nil {
if ns, err = zone.createNodeSet(c); err != nil {
c.nsMutex.Unlock()
return
}
}
c.nsMutex.Unlock()
metaNode.NodeSetID = ns.ID
if err = c.syncUpdateMetaNode(metaNode); err != nil {
metaNode.NodeSetID = oldNodeSetID
return
}
if err = c.syncUpdateNodeSet(ns); err != nil {
return
}
err = c.t.putMetaNode(metaNode)
return
}
func (c *Cluster) handleDataNodeTaskResponse(nodeAddr string, task *proto.AdminTask) {
if task == nil {
log.LogInfof("action[handleDataNodeTaskResponse] receive addr[%v] task response,but task is nil", nodeAddr)
return
}
if log.EnableDebug() {
log.LogDebugf("action[handleDataNodeTaskResponse] receive addr[%v] task response:%v", nodeAddr, task.ToString())
}
var (
err error
dataNode *DataNode
)
if dataNode, err = c.dataNode(nodeAddr); err != nil {
goto errHandler
}
dataNode.TaskManager.DelTask(task)
if err = unmarshalTaskResponse(task); err != nil {
goto errHandler
}
switch task.OpCode {
case proto.OpDeleteDataPartition:
response := task.Response.(*proto.DeleteDataPartitionResponse)
err = c.dealDeleteDataPartitionResponse(task.OperatorAddr, response)
case proto.OpLoadDataPartition:
response := task.Response.(*proto.LoadDataPartitionResponse)
err = c.handleResponseToLoadDataPartition(task.OperatorAddr, response)
case proto.OpDataNodeHeartbeat:
response := task.Response.(*proto.DataNodeHeartbeatResponse)
err = c.handleDataNodeHeartbeatResp(task.OperatorAddr, response)
case proto.OpVersionOperation:
response := task.Response.(*proto.MultiVersionOpResponse)
err = c.dealOpDataNodeMultiVerResp(task.OperatorAddr, response)
default:
err = fmt.Errorf("unknown operate code %v", task.OpCode)
goto errHandler
}
if err != nil {
goto errHandler
}
return
errHandler:
log.LogErrorf("process task[%v] failed,err:%v", task.ToString(), err)
return
}
func (c *Cluster) dealDeleteDataPartitionResponse(nodeAddr string, resp *proto.DeleteDataPartitionResponse) (err error) {
var dp *DataPartition
if resp.Status == proto.TaskSucceeds {
if dp, err = c.getDataPartitionByID(resp.PartitionId); err != nil {
return
}
dp.Lock()
defer dp.Unlock()
dp.removeReplicaByAddr(nodeAddr)
} else {
Warn(c.Name, fmt.Sprintf("clusterID[%v] delete data partition[%v] failed,err[%v]", c.Name, nodeAddr, resp.Result))
}
return
}
func (c *Cluster) handleResponseToLoadDataPartition(nodeAddr string, resp *proto.LoadDataPartitionResponse) (err error) {
if resp.Status == proto.TaskFailed || resp.PartitionSnapshot == nil {
return
}
var (
dataNode *DataNode
dp *DataPartition
vol *Vol
)
if dataNode, err = c.dataNode(nodeAddr); err != nil {
return
}
if resp.VolName != "" {
vol, err = c.getVol(resp.VolName)
if err != nil {
return
}
dp, err = vol.getDataPartitionByID(resp.PartitionId)
} else {
dp, err = c.getDataPartitionByID(resp.PartitionId)
}
if err != nil {
return
}
dp.loadFile(dataNode, resp)
return
}
func (c *Cluster) handleDataNodeHeartbeatResp(nodeAddr string, resp *proto.DataNodeHeartbeatResponse) (err error) {
var (
dataNode *DataNode
logMsg string
)
log.LogInfof("action[handleDataNodeHeartbeatResp] clusterID[%v] receive dataNode[%v] heartbeat, ", c.Name, nodeAddr)
if resp.Status != proto.TaskSucceeds {
Warn(c.Name, fmt.Sprintf("action[handleDataNodeHeartbeatResp] clusterID[%v] dataNode[%v] heartbeat task failed",
c.Name, nodeAddr))
return
}
if dataNode, err = c.dataNode(nodeAddr); err != nil {
goto errHandler
}
if dataNode.ToBeOffline {
log.LogInfof("action[handleDataNodeHeartbeatResp] dataNode is toBeOffline, addr[%s]", nodeAddr)
// return
}
if resp.ZoneName == "" {
resp.ZoneName = DefaultZoneName
}
if dataNode.ZoneName != resp.ZoneName {
c.t.deleteDataNode(dataNode)
oldZoneName := dataNode.ZoneName
dataNode.ZoneName = resp.ZoneName
c.adjustDataNode(dataNode)
log.LogWarnf("dataNode [%v] zone changed from [%v] to [%v]", dataNode.Addr, oldZoneName, resp.ZoneName)
}
// change cpu util and io used
dataNode.CpuUtil.Store(resp.CpuUtil)
dataNode.SetIoUtils(resp.IoUtils)
dataNode.updateNodeMetric(resp)
if err = c.t.putDataNode(dataNode); err != nil {
log.LogErrorf("action[handleDataNodeHeartbeatResp] dataNode[%v],zone[%v],node set[%v], err[%v]", dataNode.Addr, dataNode.ZoneName, dataNode.NodeSetID, err)
}
c.updateDataNode(dataNode, resp.PartitionReports)
logMsg = fmt.Sprintf("action[handleDataNodeHeartbeatResp],dataNode:%v,zone[%v], ReportTime:%v success", dataNode.Addr, dataNode.ZoneName, time.Now().Unix())
log.LogInfof(logMsg)
return
errHandler:
logMsg = fmt.Sprintf("nodeAddr %v heartbeat error :%v", nodeAddr, err.Error())
log.LogError(logMsg)
return
}
func (c *Cluster) adjustDataNode(dataNode *DataNode) {
c.dnMutex.Lock()
defer c.dnMutex.Unlock()
oldNodeSetID := dataNode.NodeSetID
var err error
defer func() {
if err != nil {
err = fmt.Errorf("action[adjustDataNode],clusterID[%v] dataNodeAddr:%v,zone[%v] err:%v ", c.Name, dataNode.Addr, dataNode.ZoneName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
}()
var zone *Zone
zone, err = c.t.getZone(dataNode.ZoneName)
if err != nil {
zone = newZone(dataNode.ZoneName)
c.t.putZone(zone)
}
c.nsMutex.Lock()
ns := zone.getAvailNodeSetForDataNode()
if ns == nil {
if ns, err = zone.createNodeSet(c); err != nil {
c.nsMutex.Unlock()
return
}
}
c.nsMutex.Unlock()
dataNode.NodeSetID = ns.ID
if err = c.syncUpdateDataNode(dataNode); err != nil {
dataNode.NodeSetID = oldNodeSetID
return
}
if err = c.syncUpdateNodeSet(ns); err != nil {
return
}
err = c.t.putDataNode(dataNode)
return
}
// updateDataNode iterates over the data partition reports from a data node and updates
// each reported data partition's metrics, skipping volumes marked for deletion.
func (c *Cluster) updateDataNode(dataNode *DataNode, dps []*proto.DataPartitionReport) {
for _, vr := range dps {
if vr == nil {
continue
}
if vr.VolName != "" {
vol, err := c.getVol(vr.VolName)
if err != nil {
continue
}
if vol.Status == proto.VolStatusMarkDelete {
continue
}
if dp, err := vol.getDataPartitionByID(vr.PartitionID); err == nil {
dp.updateMetric(vr, dataNode, c)
}
} else {
if dp, err := c.getDataPartitionByID(vr.PartitionID); err == nil {
dp.updateMetric(vr, dataNode, c)
}
}
}
}
func (c *Cluster) updateMetaNode(metaNode *MetaNode, metaPartitions []*proto.MetaPartitionReport, threshold bool) {
var (
vol *Vol
err error
)
for _, mr := range metaPartitions {
if mr == nil {
continue
}
var mp *MetaPartition
if mr.VolName != "" {
vol, err = c.getVol(mr.VolName)
if err != nil {
continue
}
if vol.Status == proto.VolStatusMarkDelete {
continue
}
mp, err = vol.metaPartition(mr.PartitionID)
if err != nil {
continue
}
} else {
mp, err = c.getMetaPartitionByID(mr.PartitionID)
if err != nil {
continue
}
}
// send latest end to replica metanode, including updating the end after MaxMP split when the old MaxMP is unavailable
if mr.End != mp.End {
mp.addUpdateMetaReplicaTask(c)
}
mp.updateMetaPartition(mr, metaNode)
vol.uidSpaceManager.volUidUpdate(mr)
vol.quotaManager.quotaUpdate(mr)
c.updateInodeIDUpperBound(mp, mr, threshold, metaNode)
}
}
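// updateInodeIDUpperBound splits the volume's largest meta partition once the meta node
// reaches its memory threshold; the new end is the reported max inode ID (or the start,
// if no inode has been allocated) plus MetaPartitionInodeIdStep.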
func (c *Cluster) updateInodeIDUpperBound(mp *MetaPartition, mr *proto.MetaPartitionReport, hasArriveThreshold bool, metaNode *MetaNode) (err error) {
if !hasArriveThreshold {
return
}
var vol *Vol
if vol, err = c.getVol(mp.volName); err != nil {
log.LogWarnf("action[updateInodeIDRange] vol[%v] not found", mp.volName)
return
}
maxPartitionID := vol.maxPartitionID()
if mr.PartitionID < maxPartitionID {
return
}
var end uint64
metaPartitionInodeIdStep := gConfig.MetaPartitionInodeIdStep
if mr.MaxInodeID <= 0 {
end = mr.Start + metaPartitionInodeIdStep
} else {
end = mr.MaxInodeID + metaPartitionInodeIdStep
}
log.LogWarnf("mpId[%v],start[%v],end[%v],addr[%v],used[%v]", mp.PartitionID, mp.Start, mp.End, metaNode.Addr, metaNode.Used)
if c.cfg.DisableAutoCreate {
log.LogWarnf("updateInodeIDUpperBound: disable auto create meta partition, mp %d", mp.PartitionID)
return
}
if err = vol.splitMetaPartition(c, mp, end, metaPartitionInodeIdStep, false); err != nil {
log.LogError(err)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
syslog "log"
"strconv"
"strings"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
pt "github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
)
// config key
const (
colonSplit = ":"
commaSplit = ","
cfgPeers = "peers"
// if the data partition has not been reported within this interval (in terms of seconds), it will be considered as missing.
missingDataPartitionInterval = "missingDataPartitionInterval"
cfgDpNoLeaderReportIntervalSec = "dpNoLeaderReportIntervalSec"
cfgMpNoLeaderReportIntervalSec = "mpNoLeaderReportIntervalSec"
dataPartitionTimeOutSec = "dataPartitionTimeOutSec"
NumberOfDataPartitionsToLoad = "numberOfDataPartitionsToLoad"
secondsToFreeDataPartitionAfterLoad = "secondsToFreeDataPartitionAfterLoad"
nodeSetCapacity = "nodeSetCap"
cfgMetaNodeReservedMem = "metaNodeReservedMem"
heartbeatPortKey = "heartbeatPort"
replicaPortKey = "replicaPort"
faultDomain = "faultDomain"
cfgDomainBatchGrpCnt = "faultDomainGrpBatchCnt"
cfgDomainBuildAsPossible = "faultDomainBuildAsPossible"
cfgmetaPartitionInodeIdStep = "metaPartitionInodeIdStep"
cfgMaxQuotaNumPerVol = "maxQuotaNumPerVol"
disableAutoCreate = "disableAutoCreate"
cfgMonitorPushAddr = "monitorPushAddr"
intervalToScanS3Expiration = "intervalToScanS3Expiration"
cfgVolForceDeletion = "volForceDeletion"
cfgVolDeletionDentryThreshold = "volDeletionDentryThreshold"
)
// default value
const (
defaultTobeFreedDataPartitionCount = 1000
defaultSecondsToFreeDataPartitionAfterLoad = 5 * 60 // a data partition can only be freed 5 minutes after it has been loaded
defaultIntervalToFreeDataPartition = 10 // in terms of seconds
defaultIntervalToCheck = 60
defaultIntervalToCheckHeartbeat = 6
defaultIntervalToCheckDataPartition = 5
defaultIntervalToCheckQos = 1
defaultIntervalToCheckCrc = 20 * defaultIntervalToCheck // in terms of seconds
noHeartBeatTimes = 3 // number of times that no heartbeat reported
defaultNodeTimeOutSec = noHeartBeatTimes * defaultIntervalToCheckHeartbeat
defaultDataPartitionTimeOutSec = 5 * defaultIntervalToCheckHeartbeat
defaultMissingDataPartitionInterval = 24 * 3600
defaultDpNoLeaderReportIntervalSec = 10 * 60
defaultMpNoLeaderReportIntervalSec = 5
defaultIntervalToAlarmMissingDataPartition = 60 * 60
timeToWaitForResponse = 120 // time to wait for response by the master during loading partition
defaultPeriodToLoadAllDataPartitions = 60 * 60 * 4 // interval at which the master reloads all data partitions
defaultNumberOfDataPartitionsToLoad = 50 // how many data partitions to load every time
defaultMetaPartitionTimeOutSec = 10 * defaultIntervalToCheckHeartbeat
// DefaultMetaPartitionMissSec = 3600
defaultIntervalToAlarmMissingMetaPartition = 10 * 60 // interval of checking if a replica is missing
defaultMetaPartitionMemUsageThreshold float32 = 0.75 // memory usage threshold on a meta partition
defaultDomainUsageThreshold float64 = 0.90 // storage usage threshold on a data partition
defaultOverSoldFactor float32 = 0 // 0 means no oversold limit
defaultMaxMetaPartitionCountOnEachNode = 10000
defaultReplicaNum = 3
defaultDiffSpaceUsage = 1024 * 1024 * 1024
defaultDiffReplicaFileCount = 20
defaultNodeSetGrpStep = 1
defaultMasterMinQosAccept = 20000
defaultMaxDpCntLimit = 3000
defaultIntervalToScanS3Expiration = 12 * 3600
defaultMaxConcurrentLcNodes = 3
defaultIntervalToCheckDelVerTaskExpiration = 3
metaPartitionInodeUsageThreshold float64 = 0.75 // inode usage threshold on a meta partition
lowerLimitRWMetaPartition = 3 // lower limit of RW meta partitions, equal to defaultReplicaNum
)
// AddrDatabase is a map that stores the address of a given host (e.g., the leader)
var AddrDatabase = make(map[uint64]string)
type clusterConfig struct {
secondsToFreeDataPartitionAfterLoad int64
NodeTimeOutSec int64
MissingDataPartitionInterval int64
DpNoLeaderReportIntervalSec int64
MpNoLeaderReportIntervalSec int64
DataPartitionTimeOutSec int64
IntervalToAlarmMissingDataPartition int64
PeriodToLoadALLDataPartitions int64
metaNodeReservedMem uint64
IntervalToCheckDataPartition int // seconds
IntervalToCheckQos int // seconds
numberOfDataPartitionsToFree int
numberOfDataPartitionsToLoad int
nodeSetCapacity int
MetaNodeThreshold float32
ClusterLoadFactor float32
MetaNodeDeleteBatchCount uint64 // metanode delete batch count
DataNodeDeleteLimitRate uint64 // datanode delete limit rate
MetaNodeDeleteWorkerSleepMs uint64 // metaNode delete worker sleep time with millisecond. if 0 for no sleep
MaxDpCntLimit uint64 // datanode data partition limit
DataNodeAutoRepairLimitRate uint64 // datanode autorepair limit rate
DpMaxRepairErrCnt uint64
DpRepairTimeOut uint64
peers []raftstore.PeerAddress
peerAddrs []string
heartbeatPort int64
replicaPort int64
diffReplicaSpaceUsage uint64
diffReplicaFileCount uint32
faultDomain bool
DefaultNormalZoneCnt int
DomainBuildAsPossible bool
DataPartitionUsageThreshold float64
QosMasterAcceptLimit uint64
DirChildrenNumLimit uint32
MetaPartitionInodeIdStep uint64
MaxQuotaNumPerVol int
DisableAutoCreate bool
MonitorPushAddr string
IntervalToScanS3Expiration int64
MaxConcurrentLcNodes uint64
volForceDeletion bool // whether to ignore a volume's dentry count when deleting the volume
volDeletionDentryThreshold uint64 // when volForceDeletion is false, volume deletion is only allowed if the dentry count is below this threshold
}
func newClusterConfig() (cfg *clusterConfig) {
cfg = new(clusterConfig)
cfg.numberOfDataPartitionsToFree = defaultTobeFreedDataPartitionCount
cfg.secondsToFreeDataPartitionAfterLoad = defaultSecondsToFreeDataPartitionAfterLoad
cfg.NodeTimeOutSec = defaultNodeTimeOutSec
cfg.MissingDataPartitionInterval = defaultMissingDataPartitionInterval
cfg.DpNoLeaderReportIntervalSec = defaultDpNoLeaderReportIntervalSec
cfg.MpNoLeaderReportIntervalSec = defaultMpNoLeaderReportIntervalSec
cfg.DataPartitionTimeOutSec = defaultDataPartitionTimeOutSec
cfg.IntervalToCheckDataPartition = defaultIntervalToCheckDataPartition
cfg.IntervalToCheckQos = defaultIntervalToCheckQos
cfg.IntervalToAlarmMissingDataPartition = defaultIntervalToAlarmMissingDataPartition
cfg.numberOfDataPartitionsToLoad = defaultNumberOfDataPartitionsToLoad
cfg.PeriodToLoadALLDataPartitions = defaultPeriodToLoadAllDataPartitions
cfg.MetaNodeThreshold = defaultMetaPartitionMemUsageThreshold
cfg.ClusterLoadFactor = defaultOverSoldFactor
cfg.MaxDpCntLimit = defaultMaxDpCntLimit
cfg.metaNodeReservedMem = defaultMetaNodeReservedMem
cfg.diffReplicaSpaceUsage = defaultDiffSpaceUsage
cfg.diffReplicaFileCount = defaultDiffReplicaFileCount
cfg.QosMasterAcceptLimit = defaultMasterMinQosAccept
cfg.DirChildrenNumLimit = pt.DefaultDirChildrenNumLimit
cfg.MetaPartitionInodeIdStep = defaultMetaPartitionInodeIDStep
cfg.MaxQuotaNumPerVol = defaultMaxQuotaNumPerVol
cfg.IntervalToScanS3Expiration = defaultIntervalToScanS3Expiration
cfg.MaxConcurrentLcNodes = defaultMaxConcurrentLcNodes
return
}
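// parsePeerAddr parses a peer address of the form "id:ip:port",
// e.g. "1:192.168.0.1:17010" yields id=1, ip="192.168.0.1", port=17010.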
func parsePeerAddr(peerAddr string) (id uint64, ip string, port uint64, err error) {
peerStr := strings.Split(peerAddr, colonSplit)
id, err = strconv.ParseUint(peerStr[0], 10, 64)
if err != nil {
return
}
port, err = strconv.ParseUint(peerStr[2], 10, 64)
if err != nil {
return
}
ip = peerStr[1]
return
}
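// parsePeers parses a comma-separated list of "id:ip:port" peers into cfg.peers and AddrDatabase.
// Illustrative usage (hypothetical values, not from the original source):
//	cfg := newClusterConfig()
//	if err := cfg.parsePeers("1:10.0.0.1:17010,2:10.0.0.2:17010"); err != nil {
//		log.LogErrorf("parse peers failed: %v", err)
//	}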
func (cfg *clusterConfig) parsePeers(peerStr string) error {
peerArr := strings.Split(peerStr, commaSplit)
cfg.peerAddrs = peerArr
for _, peerAddr := range peerArr {
id, ip, port, err := parsePeerAddr(peerAddr)
if err != nil {
return err
}
cfg.peers = append(cfg.peers, raftstore.PeerAddress{Peer: proto.Peer{ID: id}, Address: ip, HeartbeatPort: int(cfg.heartbeatPort), ReplicaPort: int(cfg.replicaPort)})
address := fmt.Sprintf("%v:%v", ip, port)
syslog.Println(address)
AddrDatabase[id] = address
}
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/log"
)
// DataNode stores all the information about a data node
type DataNode struct {
Total uint64 `json:"TotalWeight"`
Used uint64 `json:"UsedWeight"`
AvailableSpace uint64
ID uint64
ZoneName string `json:"Zone"`
Addr string
DomainAddr string
ReportTime time.Time
StartTime int64
LastUpdateTime time.Time
isActive bool
sync.RWMutex `graphql:"-"`
UsageRatio float64 // used / total space
SelectedTimes uint64 // number of times that this data node has been selected as the location for a data partition
TaskManager *AdminTaskManager `graphql:"-"`
DataPartitionReports []*proto.DataPartitionReport
DataPartitionCount uint32
TotalPartitionSize uint64
NodeSetID uint64
PersistenceDataPartitions []uint64
BadDisks []string // Keep this old field for compatibility
BadDiskStats []proto.BadDiskStat // key: disk path
DecommissionedDisks sync.Map
ToBeOffline bool
RdOnly bool
MigrateLock sync.RWMutex
QosIopsRLimit uint64
QosIopsWLimit uint64
QosFlowRLimit uint64
QosFlowWLimit uint64
DecommissionStatus uint32
DecommissionDstAddr string
DecommissionRaftForce bool
DecommissionRetry uint8
DecommissionLimit int
DecommissionCompleteTime int64
DpCntLimit DpCountLimiter `json:"-"` // max count of data partition in a data node
CpuUtil atomicutil.Float64 `json:"-"`
ioUtils atomic.Value `json:"-"`
DecommissionDiskList []string
DecommissionDpTotal int
}
func newDataNode(addr, zoneName, clusterID string) (dataNode *DataNode) {
dataNode = new(DataNode)
dataNode.Total = 1
dataNode.Addr = addr
dataNode.ZoneName = zoneName
dataNode.LastUpdateTime = time.Now().Add(-time.Minute)
dataNode.TaskManager = newAdminTaskManager(dataNode.Addr, clusterID)
dataNode.DecommissionStatus = DecommissionInitial
dataNode.DpCntLimit = newDpCountLimiter(nil)
dataNode.CpuUtil.Store(0)
dataNode.SetIoUtils(make(map[string]float64))
return
}
func (dataNode *DataNode) GetIoUtils() map[string]float64 {
return dataNode.ioUtils.Load().(map[string]float64)
}
func (dataNode *DataNode) SetIoUtils(used map[string]float64) {
dataNode.ioUtils.Store(used)
}
func (dataNode *DataNode) checkLiveness() {
dataNode.Lock()
defer dataNode.Unlock()
log.LogInfof("action[checkLiveness] datanode[%v] report time[%v],since report time[%v], need gap [%v]",
dataNode.Addr, dataNode.ReportTime, time.Since(dataNode.ReportTime), time.Second*time.Duration(defaultNodeTimeOutSec))
if time.Since(dataNode.ReportTime) > time.Second*time.Duration(defaultNodeTimeOutSec) {
dataNode.isActive = false
}
return
}
func (dataNode *DataNode) badPartitions(diskPath string, c *Cluster) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
vols := c.copyVols()
if len(vols) == 0 {
return partitions
}
for _, vol := range vols {
dps := vol.dataPartitions.checkBadDiskDataPartitions(diskPath, dataNode.Addr)
partitions = append(partitions, dps...)
}
return
}
func (dataNode *DataNode) getDisks(c *Cluster) (diskPaths []string) {
diskPaths = make([]string, 0)
vols := c.copyVols()
if len(vols) == 0 {
return diskPaths
}
for _, vol := range vols {
disks := vol.dataPartitions.getReplicaDiskPaths(dataNode.Addr)
for _, disk := range disks {
if inStingList(disk, diskPaths) {
continue
}
diskPaths = append(diskPaths, disk)
}
}
return
}
func (dataNode *DataNode) updateNodeMetric(resp *proto.DataNodeHeartbeatResponse) {
dataNode.Lock()
defer dataNode.Unlock()
dataNode.DomainAddr = util.ParseIpAddrToDomainAddr(dataNode.Addr)
dataNode.Total = resp.Total
dataNode.Used = resp.Used
if dataNode.AvailableSpace > resp.Available ||
time.Since(dataNode.LastUpdateTime) > defaultNodeTimeOutSec*time.Second {
dataNode.AvailableSpace = resp.Available
dataNode.LastUpdateTime = time.Now()
}
dataNode.ZoneName = resp.ZoneName
dataNode.DataPartitionCount = resp.CreatedPartitionCnt
dataNode.DataPartitionReports = resp.PartitionReports
dataNode.TotalPartitionSize = resp.TotalPartitionSize
dataNode.BadDisks = resp.BadDisks
dataNode.BadDiskStats = resp.BadDiskStats
dataNode.StartTime = resp.StartTime
if dataNode.Total == 0 {
dataNode.UsageRatio = 0.0
} else {
dataNode.UsageRatio = (float64)(dataNode.Used) / (float64)(dataNode.Total)
}
dataNode.ReportTime = time.Now()
dataNode.isActive = true
log.LogDebugf("updateNodeMetric. datanode id %v addr %v total %v used %v avaliable %v", dataNode.ID, dataNode.Addr,
dataNode.Total, dataNode.Used, dataNode.AvailableSpace)
}
func (dataNode *DataNode) canAlloc() bool {
dataNode.RLock()
defer dataNode.RUnlock()
if !overSoldLimit() {
return true
}
maxCapacity := overSoldCap(dataNode.Total)
if maxCapacity < dataNode.TotalPartitionSize {
return false
}
return true
}
func (dataNode *DataNode) isWriteAble() (ok bool) {
dataNode.RLock()
defer dataNode.RUnlock()
if dataNode.isActive && dataNode.AvailableSpace > 10*util.GB && !dataNode.RdOnly {
ok = true
}
return
}
func (dataNode *DataNode) canAllocDp() bool {
if !dataNode.isWriteAble() {
return false
}
if dataNode.ToBeOffline {
log.LogWarnf("action[canAllocDp] dataNode [%v] is offline ", dataNode.Addr)
return false
}
if !dataNode.dpCntInLimit() {
return false
}
return true
}
func (dataNode *DataNode) GetDpCntLimit() uint32 {
return uint32(dataNode.DpCntLimit.GetCntLimit())
}
func (dataNode *DataNode) dpCntInLimit() bool {
return dataNode.DataPartitionCount <= dataNode.GetDpCntLimit()
}
func (dataNode *DataNode) isWriteAbleWithSize(size uint64) (ok bool) {
dataNode.RLock()
defer dataNode.RUnlock()
if dataNode.isActive && dataNode.AvailableSpace > size {
ok = true
}
return
}
func (dataNode *DataNode) GetID() uint64 {
dataNode.RLock()
defer dataNode.RUnlock()
return dataNode.ID
}
func (dataNode *DataNode) GetAddr() string {
dataNode.RLock()
defer dataNode.RUnlock()
return dataNode.Addr
}
// SelectNodeForWrite implements "SelectNodeForWrite" in the Node interface
func (dataNode *DataNode) SelectNodeForWrite() {
dataNode.Lock()
defer dataNode.Unlock()
dataNode.UsageRatio = float64(dataNode.Used) / float64(dataNode.Total)
dataNode.SelectedTimes++
}
func (dataNode *DataNode) clean() {
dataNode.TaskManager.exitCh <- struct{}{}
}
func (dataNode *DataNode) createHeartbeatTask(masterAddr string, enableDiskQos bool) (task *proto.AdminTask) {
request := &proto.HeartBeatRequest{
CurrTime: time.Now().Unix(),
MasterAddr: masterAddr,
}
request.EnableDiskQos = enableDiskQos
request.QosIopsReadLimit = dataNode.QosIopsRLimit
request.QosIopsWriteLimit = dataNode.QosIopsWLimit
request.QosFlowReadLimit = dataNode.QosFlowRLimit
request.QosFlowWriteLimit = dataNode.QosFlowWLimit
request.DecommissionDisks = dataNode.getDecommissionedDisks()
task = proto.NewAdminTask(proto.OpDataNodeHeartbeat, dataNode.Addr, request)
return
}
func (dataNode *DataNode) addDecommissionedDisk(diskPath string) (exist bool) {
_, exist = dataNode.DecommissionedDisks.LoadOrStore(diskPath, struct{}{})
log.LogInfof("action[addDecommissionedDisk] finish, exist[%v], decommissioned disk[%v], dataNode[%v]", exist, diskPath, dataNode.Addr)
return
}
func (dataNode *DataNode) deleteDecommissionedDisk(diskPath string) (exist bool) {
_, exist = dataNode.DecommissionedDisks.LoadAndDelete(diskPath)
log.LogInfof("action[deleteDecommissionedDisk] finish, exist[%v], decommissioned disk[%v], dataNode[%v]", exist, diskPath, dataNode.Addr)
return
}
func (dataNode *DataNode) getDecommissionedDisks() (decommissionedDisks []string) {
dataNode.DecommissionedDisks.Range(func(key, value interface{}) bool {
if diskPath, ok := key.(string); ok {
decommissionedDisks = append(decommissionedDisks, diskPath)
}
return true
})
return
}
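// updateDecommissionStatus aggregates the decommission progress of all disks on the data
// node and derives the node-level status: prepare while no disk has started, success once
// every disk has finished, fail when all remaining partitions have failed, and running
// otherwise. Progress is the average of the per-disk progress values.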
func (dataNode *DataNode) updateDecommissionStatus(c *Cluster, debug bool) (uint32, float64) {
var (
partitionIds []uint64
failedPartitionIds []uint64
runningPartitionIds []uint64
preparePartitionIds []uint64
stopPartitionIds []uint64
totalDisk = len(dataNode.DecommissionDiskList)
markDiskNum = 0
successDiskNum = 0
progress float64
)
if dataNode.GetDecommissionStatus() == DecommissionInitial {
return DecommissionInitial, float64(0)
}
if dataNode.GetDecommissionStatus() == markDecommission {
return markDecommission, float64(0)
}
if dataNode.GetDecommissionStatus() == DecommissionSuccess {
return DecommissionSuccess, float64(1)
}
if dataNode.GetDecommissionStatus() == DecommissionPause {
return DecommissionPause, float64(0)
}
defer func() {
c.syncUpdateDataNode(dataNode)
}()
// the node has not entered the running status yet
if dataNode.DecommissionRetry >= defaultDecommissionRetryLimit {
dataNode.markDecommissionFail()
return DecommissionFail, float64(0)
}
log.LogDebugf("action[GetLatestDecommissionDataPartition]dataNode %v diskList %v",
dataNode.Addr, dataNode.DecommissionDiskList)
if totalDisk == 0 {
dataNode.SetDecommissionStatus(DecommissionInitial)
return DecommissionInitial, float64(0)
}
for _, disk := range dataNode.DecommissionDiskList {
key := fmt.Sprintf("%s_%s", dataNode.Addr, disk)
// if not found, the disk may have already succeeded, so only track disks that are still in the cache
if value, ok := c.DecommissionDisks.Load(key); ok {
dd := value.(*DecommissionDisk)
status := dd.GetDecommissionStatus()
if status == DecommissionSuccess {
successDiskNum++
} else if status == markDecommission {
markDiskNum++
}
_, diskProgress := dd.updateDecommissionStatus(c, debug)
progress += diskProgress
} else {
successDiskNum++ // disk with DecommissionSuccess will be removed from cache
progress += float64(1)
}
}
// only the running/prepare/success states of the data node matter here
// no disk has acquired a decommission token yet
if markDiskNum == totalDisk {
dataNode.SetDecommissionStatus(DecommissionPrepare)
return DecommissionPrepare, float64(0)
} else {
if successDiskNum == totalDisk {
dataNode.SetDecommissionStatus(DecommissionSuccess)
return DecommissionSuccess, float64(1)
}
}
// otherwise derive the node's running status from its data partitions
partitions := dataNode.GetLatestDecommissionDataPartition(c)
// Get all dp on this dataNode
failedNum := 0
runningNum := 0
prepareNum := 0
stopNum := 0
for _, dp := range partitions {
if dp.IsDecommissionFailed() {
failedNum++
failedPartitionIds = append(failedPartitionIds, dp.PartitionID)
}
if dp.GetDecommissionStatus() == DecommissionRunning {
runningNum++
runningPartitionIds = append(runningPartitionIds, dp.PartitionID)
}
if dp.GetDecommissionStatus() == DecommissionPrepare {
prepareNum++
preparePartitionIds = append(preparePartitionIds, dp.PartitionID)
}
// the data node may have been paused earlier, and those partitions are still counted here
if dp.GetDecommissionStatus() == DecommissionPause {
stopNum++
stopPartitionIds = append(stopPartitionIds, dp.PartitionID)
}
partitionIds = append(partitionIds, dp.PartitionID)
}
progress = progress / float64(totalDisk)
if failedNum >= (len(partitions)-stopNum) && failedNum != 0 {
dataNode.markDecommissionFail()
return DecommissionFail, progress
}
dataNode.SetDecommissionStatus(DecommissionRunning)
if debug {
log.LogInfof("action[updateDecommissionStatus] dataNode[%v] progress[%v] totalNum[%v] "+
"partitionIds %v FailedNum[%v] failedPartitionIds %v, runningNum[%v] runningDp %v, prepareNum[%v] prepareDp %v "+
"stopNum[%v] stopPartitionIds %v ",
dataNode.Addr, progress, len(partitions), partitionIds, failedNum, failedPartitionIds, runningNum, runningPartitionIds,
prepareNum, preparePartitionIds, stopNum, stopPartitionIds)
}
return DecommissionRunning, progress
}
func (dataNode *DataNode) GetLatestDecommissionDataPartition(c *Cluster) (partitions []*DataPartition) {
log.LogDebugf("action[GetLatestDecommissionDataPartition]dataNode %v diskList %v", dataNode.Addr, dataNode.DecommissionDiskList)
for _, disk := range dataNode.DecommissionDiskList {
key := fmt.Sprintf("%s_%s", dataNode.Addr, disk)
// if not found, the disk may have already succeeded, so only track disks that are still in the cache
if value, ok := c.DecommissionDisks.Load(key); ok {
dd := value.(*DecommissionDisk)
dps := c.getAllDecommissionDataPartitionByDiskAndTerm(dd.SrcAddr, dd.DiskPath, dd.DecommissionTerm)
partitions = append(partitions, dps...)
dpIds := make([]uint64, 0)
for _, dp := range dps {
dpIds = append(dpIds, dp.PartitionID)
}
log.LogDebugf("action[GetLatestDecommissionDataPartition]dataNode %v disk %v dps[%v]",
dataNode.Addr, dd.DiskPath, dpIds)
}
}
return
}
func (dataNode *DataNode) GetDecommissionStatus() uint32 {
return atomic.LoadUint32(&dataNode.DecommissionStatus)
}
func (dataNode *DataNode) SetDecommissionStatus(status uint32) {
atomic.StoreUint32(&dataNode.DecommissionStatus, status)
}
func (dataNode *DataNode) GetDecommissionFailedDPByTerm(c *Cluster) (error, []uint64) {
var (
failedDps []uint64
err error
)
if dataNode.GetDecommissionStatus() != DecommissionFail {
err = fmt.Errorf("action[GetDecommissionDataNodeFailedDP]dataNode[%s] status must be failed,but[%d]",
dataNode.Addr, dataNode.GetDecommissionStatus())
return err, failedDps
}
partitions := dataNode.GetLatestDecommissionDataPartition(c)
log.LogDebugf("action[GetDecommissionDataNodeFailedDP] partitions len %v", len(partitions))
for _, dp := range partitions {
if dp.IsDecommissionFailed() {
failedDps = append(failedDps, dp.PartitionID)
log.LogWarnf("action[GetDecommissionDataNodeFailedDP] dp[%v] failed", dp.PartitionID)
}
}
log.LogWarnf("action[GetDecommissionDataNodeFailedDP] failed dp list [%v]", failedDps)
return nil, failedDps
}
func (dataNode *DataNode) GetDecommissionFailedDP(c *Cluster) (error, []uint64) {
var (
failedDps []uint64
err error
)
if dataNode.GetDecommissionStatus() != DecommissionFail {
err = fmt.Errorf("action[GetDecommissionDataNodeFailedDP]dataNode[%s] status must be failed,but[%d]",
dataNode.Addr, dataNode.GetDecommissionStatus())
return err, failedDps
}
partitions := c.getAllDecommissionDataPartitionByDataNode(dataNode.Addr)
log.LogDebugf("action[GetDecommissionDataNodeFailedDP] partitions len %v", len(partitions))
for _, dp := range partitions {
if dp.IsDecommissionFailed() {
failedDps = append(failedDps, dp.PartitionID)
log.LogWarnf("action[GetDecommissionDataNodeFailedDP] dp[%v] failed", dp.PartitionID)
}
}
log.LogWarnf("action[GetDecommissionDataNodeFailedDP] failed dp list [%v]", failedDps)
return nil, failedDps
}
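// markDecommission marks the whole data node for decommission and clears the retry count
// and disk list left over from any previous attempt.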
func (dataNode *DataNode) markDecommission(targetAddr string, raftForce bool, limit int) {
dataNode.SetDecommissionStatus(markDecommission)
dataNode.DecommissionRaftForce = raftForce
dataNode.DecommissionDstAddr = targetAddr
// reset the retry count in case a previous decommission attempt failed
dataNode.DecommissionRetry = 0
dataNode.DecommissionLimit = limit
dataNode.DecommissionDiskList = make([]string, 0)
}
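// canMarkDecommission reports whether a new decommission can be started on this node:
// only the Initial, Pause and Fail states may be marked again.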
func (dataNode *DataNode) canMarkDecommission() bool {
status := dataNode.GetDecommissionStatus()
return status == DecommissionInitial || status == DecommissionPause || status == DecommissionFail
}
func (dataNode *DataNode) markDecommissionSuccess(c *Cluster) {
dataNode.SetDecommissionStatus(DecommissionSuccess)
partitions := c.getAllDataPartitionByDataNode(dataNode.Addr)
// if only part of the data partitions were decommissioned, the node can still allocate dps in the future
if len(partitions) != 0 {
dataNode.ToBeOffline = false
}
dataNode.DecommissionCompleteTime = time.Now().Unix()
}
func (dataNode *DataNode) markDecommissionFail() {
dataNode.SetDecommissionStatus(DecommissionFail)
// dataNode.ToBeOffline = false
// dataNode.DecommissionCompleteTime = time.Now().Unix()
}
func (dataNode *DataNode) resetDecommissionStatus() {
dataNode.SetDecommissionStatus(DecommissionInitial)
dataNode.DecommissionRaftForce = false
dataNode.DecommissionDstAddr = ""
dataNode.DecommissionRetry = 0
dataNode.DecommissionLimit = 0
dataNode.DecommissionCompleteTime = 0
dataNode.DecommissionDiskList = make([]string, 0)
}
func (dataNode *DataNode) createVersionTask(volume string, version uint64, op uint8, addr string, verList []*proto.VolVersionInfo) (task *proto.AdminTask) {
request := &proto.MultiVersionOpRequest{
VolumeID: volume,
VerSeq: version,
Op: uint8(op),
Addr: addr,
VolVerList: verList,
}
log.LogInfof("action[createVersionTask] op %v datanode addr %v addr %v volume %v seq %v", op, dataNode.Addr, addr, volume, version)
task = proto.NewAdminTask(proto.OpVersionOperation, dataNode.Addr, request)
return
}
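// CanBePaused reports whether the node's decommission can be paused: only a marked,
// running, or already paused decommission may be paused.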
func (dataNode *DataNode) CanBePaused() bool {
status := dataNode.GetDecommissionStatus()
if status == DecommissionRunning || status == markDecommission || status == DecommissionPause {
return true
}
return false
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// DataPartition represents the structure of storing the file contents.
type DataPartition struct {
PartitionID uint64
PartitionType int
PartitionTTL int64
LastLoadedTime int64
ReplicaNum uint8
Status int8
isRecover bool
Replicas []*DataReplica
LeaderReportTime int64
Hosts []string // host addresses
Peers []proto.Peer
offlineMutex sync.RWMutex
sync.RWMutex
total uint64
used uint64
MissingNodes map[string]int64 // key: address of the missing node, value: when the node is missing
VolName string
VolID uint64
modifyTime int64
createTime int64
lastWarnTime int64
OfflinePeerID uint64
FileInCoreMap map[string]*FileInCore
FilesWithMissingReplica map[string]int64 // key: file name, value: last time when a missing replica is found
RdOnly bool
addReplicaMutex sync.RWMutex
DecommissionRetry int
DecommissionStatus uint32
DecommissionSrcAddr string
DecommissionDstAddr string
DecommissionRaftForce bool
DecommissionSrcDiskPath string
DecommissionTerm uint64
DecommissionDstAddrSpecify bool // if true, do not roll back when adding a replica fails
DecommissionNeedRollback bool
DecommissionNeedRollbackTimes int
SpecialReplicaDecommissionStop chan bool // used to stop a special replica decommission
SpecialReplicaDecommissionStep uint32
IsDiscard bool
VerSeq uint64
RecoverStartTime time.Time
RecoverLastConsumeTime time.Duration
DecommissionWaitTimes int
}
type DataPartitionPreLoad struct {
PreloadCacheTTL uint64
preloadCacheCapacity int
preloadReplicaNum int
preloadZoneName string
}
func (d *DataPartitionPreLoad) toString() string {
return fmt.Sprintf("PreloadCacheTTL[%d]_preloadCacheCapacity[%d]_preloadReplicaNum[%d]_preloadZoneName[%s]",
d.PreloadCacheTTL, d.preloadCacheCapacity, d.preloadReplicaNum, d.preloadZoneName)
}
func newDataPartition(ID uint64, replicaNum uint8, volName string, volID uint64, partitionType int, partitionTTL int64) (partition *DataPartition) {
partition = new(DataPartition)
partition.ReplicaNum = replicaNum
partition.PartitionID = ID
partition.Hosts = make([]string, 0)
partition.Peers = make([]proto.Peer, 0)
partition.Replicas = make([]*DataReplica, 0)
partition.FileInCoreMap = make(map[string]*FileInCore, 0)
partition.FilesWithMissingReplica = make(map[string]int64)
partition.MissingNodes = make(map[string]int64)
partition.Status = proto.ReadOnly
partition.VolName = volName
partition.VolID = volID
partition.PartitionType = partitionType
partition.PartitionTTL = partitionTTL
now := time.Now().Unix()
partition.modifyTime = now
partition.createTime = now
partition.lastWarnTime = now
partition.SpecialReplicaDecommissionStop = make(chan bool, 1024)
partition.DecommissionStatus = DecommissionInitial
partition.SpecialReplicaDecommissionStep = SpecialDecommissionInitial
partition.DecommissionDstAddrSpecify = false
partition.LeaderReportTime = now
return
}
func (partition *DataPartition) setReadWrite() {
partition.Status = proto.ReadWrite
for _, replica := range partition.Replicas {
replica.Status = proto.ReadWrite
}
}
func (partition *DataPartition) isSpecialReplicaCnt() bool {
return partition.ReplicaNum == 1 || partition.ReplicaNum == 2
}
func (partition *DataPartition) isSingleReplica() bool {
return partition.ReplicaNum == 1
}
func (partition *DataPartition) isTwoReplica() bool {
return partition.ReplicaNum == 2
}
func (partition *DataPartition) resetFilesWithMissingReplica() {
partition.Lock()
defer partition.Unlock()
partition.FilesWithMissingReplica = make(map[string]int64)
}
func (partition *DataPartition) dataNodeStartTime() int64 {
partition.Lock()
defer partition.Unlock()
startTime := int64(0)
for _, replica := range partition.Replicas {
if startTime < replica.dataNode.StartTime {
startTime = replica.dataNode.StartTime
}
}
return startTime
}
func (partition *DataPartition) addReplica(replica *DataReplica) {
for _, r := range partition.Replicas {
if replica.Addr == r.Addr {
return
}
}
partition.Replicas = append(partition.Replicas, replica)
}
func (partition *DataPartition) tryToChangeLeaderByHost(host string) (err error) {
var dataNode *DataNode
for _, r := range partition.Replicas {
if host == r.Addr {
dataNode = r.dataNode
break
}
}
if dataNode == nil {
return fmt.Errorf("host not found[%v]", host)
}
task, err := partition.createTaskToTryToChangeLeader(host)
if err != nil {
return
}
if _, err = dataNode.TaskManager.syncSendAdminTask(task); err != nil {
return
}
return
}
func (partition *DataPartition) tryToChangeLeader(c *Cluster, dataNode *DataNode) (err error) {
task, err := partition.createTaskToTryToChangeLeader(dataNode.Addr)
if err != nil {
return
}
if _, err = dataNode.TaskManager.syncSendAdminTask(task); err != nil {
return
}
return
}
func (partition *DataPartition) prepareAddRaftMember(addPeer proto.Peer) (leaderAddr string, candidateAddrs []string, err error) {
partition.RLock()
defer partition.RUnlock()
if contains(partition.Hosts, addPeer.Addr) {
err = fmt.Errorf("vol[%v],data partition[%v] has contains host[%v]", partition.VolName, partition.PartitionID, addPeer.Addr)
return
}
candidateAddrs = make([]string, 0, len(partition.Hosts))
leaderAddr = partition.getLeaderAddr()
if leaderAddr != "" && contains(partition.Hosts, leaderAddr) {
candidateAddrs = append(candidateAddrs, leaderAddr)
} else {
leaderAddr = ""
}
for _, host := range partition.Hosts {
if host == leaderAddr {
continue
}
candidateAddrs = append(candidateAddrs, host)
}
return
}
func (partition *DataPartition) createTaskToTryToChangeLeader(addr string) (task *proto.AdminTask, err error) {
task = proto.NewAdminTask(proto.OpDataPartitionTryToLeader, addr, nil)
partition.resetTaskID(task)
return
}
func (partition *DataPartition) createTaskToAddRaftMember(addPeer proto.Peer, leaderAddr string) (task *proto.AdminTask, err error) {
task = proto.NewAdminTask(proto.OpAddDataPartitionRaftMember, leaderAddr, newAddDataPartitionRaftMemberRequest(partition.PartitionID, addPeer))
partition.resetTaskID(task)
return
}
func (partition *DataPartition) createTaskToRemoveRaftMember(c *Cluster, removePeer proto.Peer, force bool) (err error) {
doWork := func(leaderAddr string) error {
log.LogInfof("action[createTaskToRemoveRaftMember] vol[%v],data partition[%v] removePeer %v leaderAddr %v", partition.VolName, partition.PartitionID, removePeer, leaderAddr)
req := newRemoveDataPartitionRaftMemberRequest(partition.PartitionID, removePeer)
req.Force = force
task := proto.NewAdminTask(proto.OpRemoveDataPartitionRaftMember, leaderAddr, req)
partition.resetTaskID(task)
leaderDataNode, err := c.dataNode(leaderAddr)
if err != nil {
log.LogErrorf("action[createTaskToRemoveRaftMember] vol[%v],data partition[%v],err[%v]", partition.VolName, partition.PartitionID, err)
return err
}
if _, err = leaderDataNode.TaskManager.syncSendAdminTask(task); err != nil {
log.LogErrorf("action[createTaskToRemoveRaftMember] vol[%v],data partition[%v],err[%v]", partition.VolName, partition.PartitionID, err)
return err
}
return nil
}
leaderAddr := partition.getLeaderAddr()
if leaderAddr == "" {
if force {
for _, replica := range partition.Replicas {
if replica.Addr != removePeer.Addr {
leaderAddr = replica.Addr
}
doWork(leaderAddr)
}
} else {
err = proto.ErrNoLeader
return
}
} else {
return doWork(leaderAddr)
}
return
}
func (partition *DataPartition) createTaskToCreateDataPartition(addr string, dataPartitionSize uint64,
peers []proto.Peer, hosts []string, createType int, partitionType int, decommissionedDisks []string) (task *proto.AdminTask,
) {
leaderSize := 0
if createType == proto.DecommissionedCreateDataPartition {
leaderSize = int(partition.Replicas[0].Used)
}
task = proto.NewAdminTask(proto.OpCreateDataPartition, addr, newCreateDataPartitionRequest(
partition.VolName, partition.PartitionID, int(partition.ReplicaNum),
peers, int(dataPartitionSize), leaderSize, hosts, createType,
partitionType, decommissionedDisks, partition.VerSeq))
partition.resetTaskID(task)
return
}
func (partition *DataPartition) createTaskToDeleteDataPartition(addr string) (task *proto.AdminTask) {
task = proto.NewAdminTask(proto.OpDeleteDataPartition, addr, newDeleteDataPartitionRequest(partition.PartitionID))
partition.resetTaskID(task)
return
}
func (partition *DataPartition) resetTaskID(t *proto.AdminTask) {
t.ID = fmt.Sprintf("%v_DataPartitionID[%v]", t.ID, partition.PartitionID)
t.PartitionID = partition.PartitionID
}
// Check if there is a replica missing or not.
func (partition *DataPartition) hasMissingOneReplica(addr string, replicaNum int) (err error) {
hostNum := len(partition.Replicas)
inReplicas := false
for _, rep := range partition.Replicas {
if addr == rep.Addr {
inReplicas = true
}
}
if hostNum <= replicaNum-1 && inReplicas {
log.LogError(fmt.Sprintf("action[%v],partitionID:%v,err:%v",
"hasMissingOneReplica", partition.PartitionID, proto.ErrHasOneMissingReplica))
err = proto.ErrHasOneMissingReplica
}
return
}
func (partition *DataPartition) canBeOffLine(offlineAddr string) (err error) {
msg := fmt.Sprintf("action[canOffLine],partitionID:%v RocksDBHost:%v offLine:%v ",
partition.PartitionID, partition.Hosts, offlineAddr)
liveReplicas := partition.liveReplicas(defaultDataPartitionTimeOutSec)
otherLiveReplicas := make([]*DataReplica, 0)
for i := 0; i < len(liveReplicas); i++ {
replica := liveReplicas[i]
if replica.Addr != offlineAddr {
otherLiveReplicas = append(otherLiveReplicas, replica)
}
}
if partition.ReplicaNum >= 3 && len(otherLiveReplicas) < int(partition.ReplicaNum/2+1) {
var lives []string
for _, replica := range otherLiveReplicas {
lives = append(lives, replica.Addr)
}
msg = fmt.Sprintf(msg+" err:%v liveReplicas len:%v [%v] does not satisfy quorum %d ",
proto.ErrCannotBeOffLine, len(liveReplicas), lives, int(partition.ReplicaNum/2+1))
log.LogError(msg)
err = fmt.Errorf(msg)
return
}
if len(liveReplicas) == 0 {
msg = fmt.Sprintf(msg+" err:%v replicaNum:%v liveReplicas is 0 ", proto.ErrCannotBeOffLine, partition.ReplicaNum)
log.LogError(msg)
err = fmt.Errorf(msg)
return
}
return
}
// get all the valid replicas of the given data partition
func (partition *DataPartition) availableDataReplicas() (replicas []*DataReplica) {
replicas = make([]*DataReplica, 0)
for i := 0; i < len(partition.Replicas); i++ {
replica := partition.Replicas[i]
// the node reports heartbeat normally and the node is available
if replica.isLocationAvailable() && partition.hasHost(replica.Addr) {
replicas = append(replicas, replica)
}
}
return
}
// Remove the replica address from the memory.
func (partition *DataPartition) removeReplicaByAddr(addr string) {
delIndex := -1
var replica *DataReplica
for i := 0; i < len(partition.Replicas); i++ {
replica = partition.Replicas[i]
if replica.Addr == addr {
delIndex = i
break
}
}
msg := fmt.Sprintf("action[removeReplicaByAddr],data partition:%v on node:%v OffLine,the node is in replicas:%v", partition.PartitionID, addr, replica != nil)
log.LogDebug(msg)
if delIndex == -1 {
return
}
partition.FileInCoreMap = make(map[string]*FileInCore, 0)
partition.deleteReplicaByIndex(delIndex)
partition.modifyTime = time.Now().Unix()
return
}
func (partition *DataPartition) deleteReplicaByIndex(index int) {
var replicaAddrs []string
for _, replica := range partition.Replicas {
replicaAddrs = append(replicaAddrs, replica.Addr)
}
msg := fmt.Sprintf("deleteReplicaByIndex dp %v index:%v locations :%v ", partition.PartitionID, index, replicaAddrs)
log.LogInfo(msg)
replicasAfter := partition.Replicas[index+1:]
partition.Replicas = partition.Replicas[:index]
partition.Replicas = append(partition.Replicas, replicasAfter...)
}
func (partition *DataPartition) createLoadTasks() (tasks []*proto.AdminTask) {
partition.Lock()
defer partition.Unlock()
for _, addr := range partition.Hosts {
replica, err := partition.getReplica(addr)
if err != nil || !replica.isLive(defaultDataPartitionTimeOutSec) {
continue
}
replica.HasLoadResponse = false
tasks = append(tasks, partition.createLoadTask(addr))
}
partition.LastLoadedTime = time.Now().Unix()
return
}
func (partition *DataPartition) createLoadTask(addr string) (task *proto.AdminTask) {
task = proto.NewAdminTask(proto.OpLoadDataPartition, addr, newLoadDataPartitionMetricRequest(partition.PartitionID))
partition.resetTaskID(task)
return
}
func (partition *DataPartition) getReplica(addr string) (replica *DataReplica, err error) {
for index := 0; index < len(partition.Replicas); index++ {
replica = partition.Replicas[index]
if replica.Addr == addr {
return
}
}
log.LogErrorf("action[getReplica],partitionID:%v,locations:%v,err:%v",
partition.PartitionID, addr, dataReplicaNotFound(addr))
return nil, errors.Trace(dataReplicaNotFound(addr), "%v not found", addr)
}
func (partition *DataPartition) convertToDataPartitionResponse() (dpr *proto.DataPartitionResponse) {
dpr = new(proto.DataPartitionResponse)
partition.Lock()
defer partition.Unlock()
dpr.PartitionID = partition.PartitionID
dpr.PartitionType = partition.PartitionType
dpr.PartitionTTL = partition.PartitionTTL
dpr.Status = partition.Status
dpr.ReplicaNum = partition.ReplicaNum
dpr.Hosts = make([]string, len(partition.Hosts))
copy(dpr.Hosts, partition.Hosts)
dpr.LeaderAddr = partition.getLeaderAddr()
dpr.IsRecover = partition.isRecover
dpr.IsDiscard = partition.IsDiscard
return
}
func (partition *DataPartition) getLeaderAddr() (leaderAddr string) {
for _, replica := range partition.Replicas {
if replica.IsLeader {
return replica.Addr
}
}
return
}
func (partition *DataPartition) getLeaderAddrWithLock() (leaderAddr string) {
partition.RLock()
defer partition.RUnlock()
for _, replica := range partition.Replicas {
if replica.IsLeader {
return replica.Addr
}
}
return
}
func (partition *DataPartition) checkLoadResponse(timeOutSec int64) (isResponse bool) {
partition.RLock()
defer partition.RUnlock()
for _, addr := range partition.Hosts {
replica, err := partition.getReplica(addr)
if err != nil {
log.LogInfof("action[checkLoadResponse] partitionID:%v getReplica addr %v error %v", partition.PartitionID, addr, err)
return
}
timePassed := time.Now().Unix() - partition.LastLoadedTime
if !replica.HasLoadResponse && timePassed > timeToWaitForResponse {
msg := fmt.Sprintf("action[checkLoadResponse], partitionID:%v on node:%v no response, spent time %v s",
partition.PartitionID, addr, timePassed)
log.LogWarn(msg)
return
}
if !replica.isLive(timeOutSec) || !replica.HasLoadResponse {
log.LogInfof("action[checkLoadResponse] partitionID:%v getReplica addr %v replica.isLive(timeOutSec) %v", partition.PartitionID, addr, replica.isLive(timeOutSec))
return
}
}
isResponse = true
return
}
func (partition *DataPartition) getReplicaByIndex(index uint8) (replica *DataReplica) {
return partition.Replicas[int(index)]
}
func (partition *DataPartition) getFileCount() {
filesToBeDeleted := make([]string, 0)
partition.Lock()
defer partition.Unlock()
for _, replica := range partition.Replicas {
replica.FileCount = 0
}
for _, fc := range partition.FileInCoreMap {
if len(fc.MetadataArray) == 0 {
filesToBeDeleted = append(filesToBeDeleted, fc.Name)
}
for _, vfNode := range fc.MetadataArray {
replica := partition.getReplicaByIndex(vfNode.locIndex)
replica.FileCount++
}
}
for _, vfName := range filesToBeDeleted {
delete(partition.FileInCoreMap, vfName)
}
}
// Release the memory occupied by the data partition.
func (partition *DataPartition) releaseDataPartition() {
partition.Lock()
defer partition.Unlock()
liveReplicas := partition.getLiveReplicasFromHosts(defaultDataPartitionTimeOutSec)
for _, replica := range liveReplicas {
replica.HasLoadResponse = false
}
for name, fc := range partition.FileInCoreMap {
fc.MetadataArray = nil
delete(partition.FileInCoreMap, name)
}
partition.FileInCoreMap = make(map[string]*FileInCore, 0)
for name, fileMissReplicaTime := range partition.FilesWithMissingReplica {
if time.Now().Unix()-fileMissReplicaTime > 2*intervalToLoadDataPartition {
delete(partition.FilesWithMissingReplica, name)
}
}
}
func (partition *DataPartition) hasReplica(host string) (replica *DataReplica, ok bool) {
// using loop instead of map to save the memory
for _, replica = range partition.Replicas {
if replica.Addr == host {
ok = true
break
}
}
return
}
func (partition *DataPartition) checkReplicaNum(c *Cluster, vol *Vol) {
partition.RLock()
defer partition.RUnlock()
if int(partition.ReplicaNum) != len(partition.Hosts) {
msg := fmt.Sprintf("FIX DataPartition replicaNum,clusterID[%v] volName[%v] partitionID:%v orgReplicaNum:%v",
c.Name, vol.Name, partition.PartitionID, partition.ReplicaNum)
Warn(c.Name, msg)
if partition.isSpecialReplicaCnt() && partition.IsDecommissionFailed() { // in case of a restart with no message left, delete the last replica that was added
log.LogInfof("action[checkReplicaNum] volume %v partition %v need to lower replica", partition.VolName, partition.PartitionID)
vol.NeedToLowerReplica = true
return
}
}
if vol.dpReplicaNum != partition.ReplicaNum && !vol.NeedToLowerReplica {
log.LogDebugf("action[checkReplicaNum] volume %v partiton %v replicanum abnornal %v %v",
partition.VolName, partition.PartitionID, vol.dpReplicaNum, partition.ReplicaNum)
vol.NeedToLowerReplica = true
}
}
func (partition *DataPartition) hostsToString() (hosts string) {
return strings.Join(partition.Hosts, underlineSeparator)
}
func (partition *DataPartition) setToNormal() {
partition.Lock()
defer partition.Unlock()
partition.isRecover = false
}
func (partition *DataPartition) hasHost(addr string) (ok bool) {
for _, host := range partition.Hosts {
if host == addr {
ok = true
break
}
}
return
}
func (partition *DataPartition) liveReplicas(timeOutSec int64) (replicas []*DataReplica) {
replicas = make([]*DataReplica, 0)
for i := 0; i < len(partition.Replicas); i++ {
replica := partition.Replicas[i]
if replica.isLive(timeOutSec) && partition.hasHost(replica.Addr) {
replicas = append(replicas, replica)
}
}
return
}
// get all the live replicas from the persistent hosts
func (partition *DataPartition) getLiveReplicasFromHosts(timeOutSec int64) (replicas []*DataReplica) {
replicas = make([]*DataReplica, 0)
for _, host := range partition.Hosts {
replica, ok := partition.hasReplica(host)
if !ok {
continue
}
if replica.isLive(timeOutSec) {
replicas = append(replicas, replica)
}
}
return
}
// get all the live replicas, whether or not they are in the persistent hosts
func (partition *DataPartition) getLiveReplicas(timeOutSec int64) (replicas []*DataReplica) {
replicas = make([]*DataReplica, 0)
for _, replica := range partition.Replicas {
if replica.isLive(timeOutSec) {
replicas = append(replicas, replica)
}
}
return
}
func (partition *DataPartition) checkAndRemoveMissReplica(addr string) {
if _, ok := partition.MissingNodes[addr]; ok {
delete(partition.MissingNodes, addr)
}
}
func (partition *DataPartition) loadFile(dataNode *DataNode, resp *proto.LoadDataPartitionResponse) {
partition.Lock()
defer partition.Unlock()
index, err := partition.getReplicaIndex(dataNode.Addr)
if err != nil {
msg := fmt.Sprintf("loadFile partitionID:%v on node:%v don't report :%v ", partition.PartitionID, dataNode.Addr, err)
log.LogWarn(msg)
return
}
replica := partition.Replicas[index]
for _, dpf := range resp.PartitionSnapshot {
if dpf == nil {
continue
}
fc, ok := partition.FileInCoreMap[dpf.Name]
if !ok {
fc = newFileInCore(dpf.Name)
partition.FileInCoreMap[dpf.Name] = fc
}
log.LogInfof("updateFileInCore partition %v", partition.PartitionID)
fc.updateFileInCore(partition.PartitionID, dpf, replica, index)
}
replica.HasLoadResponse = true
replica.Used = resp.Used
}
func (partition *DataPartition) getReplicaIndex(addr string) (index int, err error) {
for index = 0; index < len(partition.Replicas); index++ {
replica := partition.Replicas[index]
if replica.Addr == addr {
return
}
}
log.LogErrorf("action[getReplicaIndex],partitionID:%v,location:%v,err:%v",
partition.PartitionID, addr, dataReplicaNotFound(addr))
return -1, errors.Trace(dataReplicaNotFound(addr), "%v not found ", addr)
}
func (partition *DataPartition) update(action, volName string, newPeers []proto.Peer, newHosts []string, c *Cluster) (err error) {
if len(newHosts) == 0 {
log.LogErrorf("update. action[%v] update partition[%v] vol[%v] old host[%v]", action, partition.PartitionID, volName, partition.Hosts)
return
}
orgHosts := make([]string, len(partition.Hosts))
copy(orgHosts, partition.Hosts)
oldPeers := make([]proto.Peer, len(partition.Peers))
copy(oldPeers, partition.Peers)
partition.Hosts = newHosts
partition.Peers = newPeers
if err = c.syncUpdateDataPartition(partition); err != nil {
partition.Hosts = orgHosts
partition.Peers = oldPeers
return errors.Trace(err, "action[%v] update partition[%v] vol[%v] failed", action, partition.PartitionID, volName)
}
msg := fmt.Sprintf("action[%v] success,vol[%v] partitionID:%v "+
"oldHosts:%v newHosts:%v,oldPees[%v],newPeers[%v]",
action, volName, partition.PartitionID, orgHosts, partition.Hosts, oldPeers, partition.Peers)
log.LogWarnf(msg)
return
}
func (partition *DataPartition) updateMetric(vr *proto.DataPartitionReport, dataNode *DataNode, c *Cluster) {
if !partition.hasHost(dataNode.Addr) {
return
}
partition.Lock()
defer partition.Unlock()
replica, err := partition.getReplica(dataNode.Addr)
if err != nil {
replica = newDataReplica(dataNode)
partition.addReplica(replica)
}
partition.total = vr.Total
replica.Status = int8(vr.PartitionStatus)
replica.Total = vr.Total
replica.Used = vr.Used
partition.setMaxUsed()
replica.FileCount = uint32(vr.ExtentCount)
replica.setAlive()
replica.IsLeader = vr.IsLeader
if replica.IsLeader {
partition.LeaderReportTime = time.Now().Unix()
}
replica.NeedsToCompare = vr.NeedCompare
replica.DecommissionRepairProgress = vr.DecommissionRepairProgress
if replica.DiskPath != vr.DiskPath && vr.DiskPath != "" {
oldDiskPath := replica.DiskPath
replica.DiskPath = vr.DiskPath
err = c.syncUpdateDataPartition(partition)
if err != nil {
replica.DiskPath = oldDiskPath
}
}
partition.checkAndRemoveMissReplica(dataNode.Addr)
if replica.Status == proto.ReadWrite && (partition.RdOnly || replica.dataNode.RdOnly) {
replica.Status = int8(proto.ReadOnly)
}
}
func (partition *DataPartition) setMaxUsed() {
var maxUsed uint64
for _, r := range partition.Replicas {
if r.Used > maxUsed {
maxUsed = r.Used
}
}
partition.used = maxUsed
}
func (partition *DataPartition) getMaxUsedSpace() uint64 {
return partition.used
}
func (partition *DataPartition) afterCreation(nodeAddr, diskPath string, c *Cluster) (err error) {
dataNode, err := c.dataNode(nodeAddr)
if err != nil {
return err
}
replica := newDataReplica(dataNode)
if partition.IsDecommissionRunning() {
replica.Status = proto.Recovering
} else {
replica.Status = proto.Unavailable
}
replica.DiskPath = diskPath
replica.ReportTime = time.Now().Unix()
replica.Total = util.DefaultDataPartitionSize
partition.addReplica(replica)
partition.checkAndRemoveMissReplica(replica.Addr)
log.LogInfof("action[afterCreation] dp %v add new replica %v ", partition.PartitionID, dataNode.Addr)
return
}
// Check if it makes sense to compare the CRC.
// Note that if loading the data into a data node is not finished, then there is no need to check the CRC.
func (partition *DataPartition) needsToCompareCRC() (needCompare bool) {
partition.Lock()
defer partition.Unlock()
if partition.isRecover {
return false
}
needCompare = true
for _, replica := range partition.Replicas {
if !replica.NeedsToCompare {
needCompare = false
break
}
}
return
}
func (partition *DataPartition) containsBadDisk(diskPath string, nodeAddr string) bool {
partition.RLock()
defer partition.RUnlock()
for _, replica := range partition.Replicas {
if nodeAddr == replica.Addr && diskPath == replica.DiskPath {
return true
}
}
return false
}
func (partition *DataPartition) getReplicaDisk(nodeAddr string) string {
partition.RLock()
defer partition.RUnlock()
for _, replica := range partition.Replicas {
if nodeAddr == replica.Addr {
return replica.DiskPath
}
}
return ""
}
func (partition *DataPartition) getMinus() (minus float64) {
partition.RLock()
defer partition.RUnlock()
used := partition.Replicas[0].Used
for _, replica := range partition.Replicas {
if math.Abs(float64(replica.Used)-float64(used)) > minus {
minus = math.Abs(float64(replica.Used) - float64(used))
}
}
return minus
}
func (partition *DataPartition) activeUsedSimilar() bool {
partition.RLock()
defer partition.RUnlock()
liveReplicas := partition.liveReplicas(defaultDataPartitionTimeOutSec)
used := liveReplicas[0].Used
minus := float64(0)
for _, replica := range liveReplicas {
if math.Abs(float64(replica.Used)-float64(used)) > minus {
minus = math.Abs(float64(replica.Used) - float64(used))
}
}
return minus < util.GB
}
func (partition *DataPartition) getToBeDecommissionHost(replicaNum int) (host string) {
partition.RLock()
defer partition.RUnlock()
// the new replica may have been added successfully even though decommission failed (e.g. rollback failed because deleting the new replica timed out)
if partition.isSpecialReplicaCnt() &&
partition.GetSpecialReplicaDecommissionStep() >= SpecialDecommissionWaitAddRes &&
partition.IsDecommissionFailed() {
log.LogInfof("action[getToBeDecommissionHost] get single replica partition %v need to decommission %v",
partition.PartitionID, partition.DecommissionDstAddr)
host = partition.DecommissionDstAddr
return
}
hostLen := len(partition.Hosts)
if hostLen <= 1 || hostLen <= replicaNum {
return
}
host = partition.Hosts[hostLen-1]
return
}
func (partition *DataPartition) removeOneReplicaByHost(c *Cluster, host string, isReplicaNormal bool) (err error) {
if err = c.removeDataReplica(partition, host, false, false); err != nil {
return
}
partition.RLock()
defer partition.RUnlock()
//if partition.isSpecialReplicaCnt() && isReplicaNormal {
// partition.SingleDecommissionStatus = 0
// partition.SingleDecommissionAddr = ""
// return
//}
oldReplicaNum := partition.ReplicaNum
partition.ReplicaNum = partition.ReplicaNum - 1
if err = c.syncUpdateDataPartition(partition); err != nil {
partition.ReplicaNum = oldReplicaNum
}
return
}
func (partition *DataPartition) getNodeSets() (nodeSets []uint64) {
partition.RLock()
defer partition.RUnlock()
nodeSetMap := map[uint64]struct{}{}
for _, replica := range partition.Replicas {
if replica.dataNode == nil {
continue
}
nodeSetMap[replica.dataNode.NodeSetID] = struct{}{}
}
for nodeSet := range nodeSetMap {
nodeSets = append(nodeSets, nodeSet)
}
return
}
func (partition *DataPartition) getZones() (zones []string) {
partition.RLock()
defer partition.RUnlock()
zoneMap := map[string]struct{}{}
for _, replica := range partition.Replicas {
if replica.dataNode == nil {
continue
}
zoneMap[replica.dataNode.ZoneName] = struct{}{}
}
for zone := range zoneMap {
zones = append(zones, zone)
}
return
}
func (partition *DataPartition) getLiveZones(offlineAddr string) (zones []string) {
partition.RLock()
defer partition.RUnlock()
for _, replica := range partition.Replicas {
if replica.dataNode == nil {
continue
}
if replica.dataNode.Addr == offlineAddr {
continue
}
zones = append(zones, replica.dataNode.ZoneName)
}
return
}
func (partition *DataPartition) buildDpInfo(c *Cluster) *proto.DataPartitionInfo {
partition.RLock()
defer partition.RUnlock()
replicas := make([]*proto.DataReplica, len(partition.Replicas))
for i, replica := range partition.Replicas {
dataReplica := replica.DataReplica
dataReplica.DomainAddr = replica.dataNode.DomainAddr
replicas[i] = &dataReplica
}
fileInCoreMap := make(map[string]*proto.FileInCore)
for k, v := range partition.FileInCoreMap {
fileInCoreMap[k] = v.clone()
}
zones := make([]string, len(partition.Hosts))
nodeSets := make([]uint64, len(partition.Hosts))
for idx, host := range partition.Hosts {
dataNode, err := c.dataNode(host)
if err == nil {
zones[idx] = dataNode.ZoneName
nodeSets[idx] = dataNode.NodeSetID
}
}
forbidden := true
vol, err := c.getVol(partition.VolName)
if err == nil {
forbidden = vol.Forbidden
} else {
log.LogErrorf("action[buildDpInfo]failed to get volume %v, err %v", partition.VolName, err)
}
return &proto.DataPartitionInfo{
PartitionID: partition.PartitionID,
PartitionTTL: partition.PartitionTTL,
PartitionType: partition.PartitionType,
LastLoadedTime: partition.LastLoadedTime,
ReplicaNum: partition.ReplicaNum,
Status: partition.Status,
Replicas: replicas,
Hosts: partition.Hosts,
Peers: partition.Peers,
Zones: zones,
NodeSets: nodeSets,
MissingNodes: partition.MissingNodes,
VolName: partition.VolName,
VolID: partition.VolID,
FileInCoreMap: fileInCoreMap,
OfflinePeerID: partition.OfflinePeerID,
IsRecover: partition.isRecover,
FilesWithMissingReplica: partition.FilesWithMissingReplica,
IsDiscard: partition.IsDiscard,
SingleDecommissionStatus: partition.GetSpecialReplicaDecommissionStep(),
Forbidden: forbidden,
}
}
const (
DecommissionInitial uint32 = iota
markDecommission
DecommissionPause // a marked or running decommission can be paused
DecommissionPrepare
DecommissionRunning
DecommissionSuccess
DecommissionFail
)
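// Decommission status lifecycle for a data partition, as implemented by the methods
// below (DecommissionSuccess is set elsewhere once replica repair completes, or
// immediately for non-normal partitions):
//
//	Initial/Pause/Fail --MarkDecommissionStatus--> markDecommission
//	markDecommission   --Decommission-----------> DecommissionPrepare --> DecommissionRunning
//	markDecommission/DecommissionRunning --PauseDecommission--> DecommissionPause
//	errors retry back to markDecommission, or DecommissionFail after
//	defaultDecommissionRetryLimit attempts (or when a rollback is needed)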
const (
SpecialDecommissionInitial uint32 = iota
SpecialDecommissionEnter
SpecialDecommissionWaitAddRes
SpecialDecommissionWaitAddResFin
SpecialDecommissionRemoveOld
)
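// The Special* steps above track the single/two-replica decommission path (see
// decommissionSingleDp), which waits for the new replica to be added
// (WaitAddRes/WaitAddResFin) before the old replica is removed (RemoveOld).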
const InvalidDecommissionDpCnt = -1
const (
defaultDecommissionParallelLimit = 10
defaultDecommissionRetryLimit = 5
defaultDecommissionRollbackLimit = 3
defaultDecommissionDiskParallelFactor = 0
)
func GetDecommissionStatusMessage(status uint32) string {
switch status {
case DecommissionInitial:
return "Initial"
case markDecommission:
return "Marked"
case DecommissionPause:
return "Paused"
case DecommissionRunning:
return "Running"
case DecommissionSuccess:
return "Success"
case DecommissionFail:
return "Failed"
default:
return "Unknown"
}
}
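// MarkDecommissionStatus tries to move the partition into the markDecommission state for
// the given decommission term. A paused partition first has its replica repair resumed;
// otherwise the decommission context (src/dst addr, disk, raftForce, term) is reset and
// recorded. It returns false if the partition cannot be marked.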
func (partition *DataPartition) MarkDecommissionStatus(srcAddr, dstAddr, srcDisk string, raftForce bool, term uint64, c *Cluster) bool {
if !partition.canMarkDecommission(term) {
log.LogWarnf("action[MarkDecommissionStatus] dp[%v] cannot make decommission:status[%v]",
partition.PartitionID, partition.GetDecommissionStatus())
return false
}
if partition.IsDecommissionPaused() {
if !partition.pauseReplicaRepair(partition.DecommissionDstAddr, false, c) {
log.LogWarnf("action[MarkDecommissionStatus] dp [%d] recover from stop failed", partition.PartitionID)
return false
}
partition.SetDecommissionStatus(markDecommission)
// update DecommissionTerm for the next query
partition.DecommissionTerm = term
return true
}
// initial state, or restarting a failed decommission
partition.ResetDecommissionStatus()
partition.SetDecommissionStatus(markDecommission)
partition.DecommissionSrcAddr = srcAddr
partition.DecommissionDstAddr = dstAddr
partition.DecommissionSrcDiskPath = srcDisk
partition.DecommissionRaftForce = raftForce
partition.DecommissionTerm = term
// reset special replicas decommission status
partition.isRecover = false
partition.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
if partition.DecommissionSrcDiskPath == "" {
partition.RLock()
replica, _ := partition.getReplica(srcAddr)
partition.RUnlock()
if replica != nil {
partition.DecommissionSrcDiskPath = replica.DiskPath
}
}
if dstAddr != "" {
partition.DecommissionDstAddrSpecify = true
}
log.LogDebugf("action[MarkDecommissionStatus] dp[%v] SrcAddr %v, dstAddr %v, diskPath %v, raftForce %v term %v",
partition.PartitionID, partition.DecommissionSrcAddr, partition.DecommissionDstAddr,
partition.DecommissionSrcDiskPath, partition.DecommissionRaftForce, partition.DecommissionTerm)
return true
}
func (partition *DataPartition) SetDecommissionStatus(status uint32) {
atomic.StoreUint32(&partition.DecommissionStatus, status)
}
func (partition *DataPartition) SetSpecialReplicaDecommissionStep(step uint32) {
atomic.StoreUint32(&partition.SpecialReplicaDecommissionStep, step)
}
func (partition *DataPartition) GetDecommissionStatus() uint32 {
return atomic.LoadUint32(&partition.DecommissionStatus)
}
func (partition *DataPartition) GetSpecialReplicaDecommissionStep() uint32 {
return atomic.LoadUint32(&partition.SpecialReplicaDecommissionStep)
}
func (partition *DataPartition) IsDecommissionSuccess() bool {
return partition.GetDecommissionStatus() == DecommissionSuccess
}
func (partition *DataPartition) IsDecommissionFailed() bool {
return partition.GetDecommissionStatus() == DecommissionFail
}
func (partition *DataPartition) IsDecommissionRunning() bool {
return partition.GetDecommissionStatus() == DecommissionRunning
}
func (partition *DataPartition) IsDecommissionPrepare() bool {
return partition.GetDecommissionStatus() == DecommissionPrepare
}
func (partition *DataPartition) IsDecommissionPaused() bool {
return partition.GetDecommissionStatus() == DecommissionPause
}
func (partition *DataPartition) IsDecommissionInitial() bool {
return partition.GetDecommissionStatus() == DecommissionInitial
}
func (partition *DataPartition) IsMarkDecommission() bool {
return partition.GetDecommissionStatus() == markDecommission
}
func (partition *DataPartition) IsDoingDecommission() bool {
decommStatus := partition.GetDecommissionStatus()
return (decommStatus > DecommissionInitial && decommStatus < DecommissionSuccess)
}
func (partition *DataPartition) TryToDecommission(c *Cluster) bool {
if !partition.IsMarkDecommission() {
log.LogWarnf("action[TryToDecommission] failed dp[%v] status expected markDecommission[%v]",
partition.PartitionID, atomic.LoadUint32(&partition.DecommissionStatus))
return false
}
log.LogDebugf("action[TryToDecommission] dp[%v]", partition.PartitionID)
return partition.Decommission(c)
}
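// Decommission performs a single decommission attempt: non-normal partitions are deleted
// directly; special replica-count partitions without raftForce go through the
// single-replica path (decommissionSingleDp); otherwise the source replica is removed, a
// new replica is added on the target, and the partition enters DecommissionRunning while
// it repairs. On error the attempt is retried up to defaultDecommissionRetryLimit times,
// or marked as failed immediately when a rollback is needed.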
func (partition *DataPartition) Decommission(c *Cluster) bool {
var (
msg string
err error
srcAddr = partition.DecommissionSrcAddr
targetAddr = partition.DecommissionDstAddr
)
defer func() {
c.syncUpdateDataPartition(partition)
}()
log.LogInfof("action[decommissionDataPartition] dp[%v] from node[%v] to node[%v], raftForce[%v] SingleDecommissionStatus[%v]",
partition.PartitionID, srcAddr, targetAddr, partition.DecommissionRaftForce, partition.GetSpecialReplicaDecommissionStep())
begin := time.Now()
partition.SetDecommissionStatus(DecommissionPrepare)
err = c.syncUpdateDataPartition(partition)
if err != nil {
log.LogWarnf("action[decommissionDataPartition] dp [%v] update to prepare failed", partition.PartitionID)
goto errHandler
}
// delete if not normal data partition
if !proto.IsNormalDp(partition.PartitionType) {
c.vols[partition.VolName].deleteDataPartition(c, partition)
partition.SetDecommissionStatus(DecommissionSuccess)
log.LogWarnf("action[decommissionDataPartition]delete dp directly[%v]", partition.PartitionID)
return true
}
if err = c.validateDecommissionDataPartition(partition, srcAddr); err != nil {
goto errHandler
}
err = c.updateDataNodeSize(targetAddr, partition)
if err != nil {
log.LogWarnf("action[decommissionDataPartition] target addr can't be writable, add %s %s", targetAddr, err.Error())
goto errHandler
}
defer func() {
if err != nil {
c.returnDataSize(targetAddr, partition)
}
}()
// single/two-replica partition without raftForce
if partition.isSpecialReplicaCnt() && !partition.DecommissionRaftForce {
if partition.GetSpecialReplicaDecommissionStep() == SpecialDecommissionInitial {
partition.SetSpecialReplicaDecommissionStep(SpecialDecommissionEnter)
}
if err = c.decommissionSingleDp(partition, targetAddr, srcAddr); err != nil {
goto errHandler
}
} else {
if err = c.removeDataReplica(partition, srcAddr, false, partition.DecommissionRaftForce); err != nil {
goto errHandler
}
if err = c.addDataReplica(partition, targetAddr); err != nil {
goto errHandler
}
newReplica, _ := partition.getReplica(targetAddr)
newReplica.Status = proto.Recovering // in case the heartbeat response has not arrived yet
partition.isRecover = true
partition.Status = proto.ReadOnly
partition.SetDecommissionStatus(DecommissionRunning)
partition.RecoverStartTime = time.Now()
c.putBadDataPartitionIDsByDiskPath(partition.DecommissionSrcDiskPath, partition.DecommissionSrcAddr, partition.PartitionID)
}
// only a 3-replica decommission is paused here, and its token needs to be released
if partition.IsDecommissionPaused() {
log.LogInfof("action[decommissionDataPartition]clusterID[%v] partitionID:%v decommission paused", c.Name, partition.PartitionID)
if !partition.pauseReplicaRepair(partition.DecommissionDstAddr, true, c) {
log.LogWarnf("action[decommissionDataPartition]clusterID[%v] partitionID:%v paused failed", c.Name, partition.PartitionID)
}
return true
} else {
log.LogInfof("action[decommissionDataPartition]clusterID[%v] partitionID:%v "+
"on node:%v offline success,newHost[%v],PersistenceHosts:[%v], SingleDecommissionStatus[%v]prepare consume[%v]seconds",
c.Name, partition.PartitionID, srcAddr, targetAddr, partition.Hosts, partition.GetSpecialReplicaDecommissionStep(), time.Since(begin).Seconds())
return true
}
errHandler:
// a special replica count partition received the stop signal; do not reset SingleDecommissionStatus so decommission can resume later
if partition.GetDecommissionStatus() == DecommissionPause {
log.LogWarnf("action[decommissionDataPartition] partitionID:%v is stopped", partition.PartitionID)
return true
}
partition.DecommissionRetry++
if partition.DecommissionRetry >= defaultDecommissionRetryLimit {
partition.SetDecommissionStatus(DecommissionFail)
} else {
partition.SetDecommissionStatus(markDecommission) // retry again
partition.DecommissionWaitTimes = 0
}
// if rollback is needed, set the status to fail (DecommissionDstAddr is reset during rollback)
if partition.DecommissionNeedRollback {
partition.SetDecommissionStatus(DecommissionFail)
}
msg = fmt.Sprintf("clusterID[%v] vol[%v] partitionID[%v] on Node:%v "+
"to newHost:%v Err:%v, PersistenceHosts:%v ,retry %v,status %v, isRecover %v SingleDecommissionStatus[%v]"+
" DecommissionNeedRollback[%v]",
c.Name, partition.VolName, partition.PartitionID, srcAddr, targetAddr, err.Error(),
partition.Hosts, partition.DecommissionRetry, partition.GetDecommissionStatus(),
partition.isRecover, partition.GetSpecialReplicaDecommissionStep(), partition.DecommissionNeedRollback)
Warn(c.Name, msg)
log.LogWarnf("action[decommissionDataPartition] %s", msg)
return false
}
func (partition *DataPartition) PauseDecommission(c *Cluster) bool {
status := partition.GetDecommissionStatus()
// support retry pause if pause failed last time
if status == DecommissionInitial || status == DecommissionSuccess ||
status == DecommissionFail {
log.LogWarnf("action[PauseDecommission] dp[%v] cannot be stopped status[%v]", partition.PartitionID, status)
return true
}
defer c.syncUpdateDataPartition(partition)
log.LogDebugf("action[PauseDecommission] dp[%v] status %v set to stop ",
partition.PartitionID, partition.GetDecommissionStatus())
if status == markDecommission {
partition.SetDecommissionStatus(DecommissionPause)
return true
}
if partition.isSpecialReplicaCnt() {
log.LogDebugf("action[PauseDecommission]special replica dp[%v] status[%v]",
partition.PartitionID, partition.GetSpecialReplicaDecommissionStep())
partition.SpecialReplicaDecommissionStop <- false
// if special replica is repairing, stop the process
if partition.GetSpecialReplicaDecommissionStep() == SpecialDecommissionWaitAddRes {
if !partition.pauseReplicaRepair(partition.DecommissionDstAddr, true, c) {
return false
}
}
} else {
if partition.IsDecommissionRunning() {
if !partition.pauseReplicaRepair(partition.DecommissionDstAddr, true, c) {
return false
}
log.LogDebugf("action[PauseDecommission] dp[%v] status [%v] send stop signal ",
partition.PartitionID, partition.GetDecommissionStatus())
}
}
partition.SetDecommissionStatus(DecommissionPause)
partition.isRecover = false
return true
}
func (partition *DataPartition) ResetDecommissionStatus() {
partition.DecommissionDstAddr = ""
partition.DecommissionSrcAddr = ""
partition.DecommissionRetry = 0
partition.DecommissionRaftForce = false
partition.DecommissionSrcDiskPath = ""
partition.isRecover = false
partition.DecommissionTerm = 0
partition.DecommissionDstAddrSpecify = false
partition.DecommissionNeedRollback = false
partition.DecommissionNeedRollbackTimes = 0
partition.SetDecommissionStatus(DecommissionInitial)
partition.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
partition.DecommissionWaitTimes = 0
}
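// rollback undoes a failed decommission attempt: it removes the replica that was added on
// the destination, restores the source replica's raft membership, releases the
// decommission token, and re-marks the partition so it can be decommissioned again.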
func (partition *DataPartition) rollback(c *Cluster) {
// delete the newly added replica; this may time out, in which case rollback is retried next time
err := c.removeDataReplica(partition, partition.DecommissionDstAddr, false, false)
if err != nil {
// keep decommission status to failed for rollback
log.LogWarnf("action[rollback]dp[%v] rollback to del replica[%v] failed:%v",
partition.PartitionID, partition.DecommissionDstAddr, err.Error())
return
}
err = partition.restoreReplicaMeta(c)
if err != nil {
return
}
// release token first
partition.ReleaseDecommissionToken(c)
// reset status if rollback success
partition.DecommissionDstAddr = ""
partition.DecommissionRetry = 0
partition.isRecover = false
partition.DecommissionNeedRollback = false
partition.DecommissionWaitTimes = 0
partition.SetDecommissionStatus(markDecommission)
partition.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
c.syncUpdateDataPartition(partition)
log.LogWarnf("action[rollback]dp[%v] rollback success", partition.PartitionID)
return
}
func (partition *DataPartition) addToDecommissionList(c *Cluster) {
if partition.DecommissionSrcAddr == "" {
return
}
var (
dataNode *DataNode
zone *Zone
ns *nodeSet
err error
)
if dataNode, err = c.dataNode(partition.DecommissionSrcAddr); err != nil {
log.LogWarnf("action[addToDecommissionList]find dp[%v] src decommission dataNode [%v] failed[%v]",
partition.PartitionID, partition.DecommissionSrcAddr, err.Error())
return
}
if dataNode.ZoneName == "" {
log.LogWarnf("action[addToDecommissionList]dataNode[%v] zone is nil", dataNode.Addr)
return
}
if zone, err = c.t.getZone(dataNode.ZoneName); err != nil {
log.LogWarnf("action[addToDecommissionList]dataNode[%v] zone is nil:%v", dataNode.Addr, err.Error())
return
}
if ns, err = zone.getNodeSet(dataNode.NodeSetID); err != nil {
log.LogWarnf("action[addToDecommissionList]dataNode[%v] nodeSet is nil:%v", dataNode.Addr, err.Error())
return
}
ns.AddToDecommissionDataPartitionList(partition, c)
log.LogDebugf("action[addToDecommissionList]dp[%v] decommission src[%v] Disk[%v] dst[%v] status[%v] specialStep[%v],"+
" add to decommission list[%v] ",
partition.PartitionID, partition.DecommissionSrcAddr, partition.DecommissionSrcDiskPath,
partition.DecommissionDstAddr, partition.GetDecommissionStatus(), partition.GetSpecialReplicaDecommissionStep(), ns.ID)
}
func (partition *DataPartition) checkConsumeToken() bool {
if partition.GetDecommissionStatus() == DecommissionRunning {
return true
}
return false
}
// the partition can be marked only from the initial, paused, or failed state, or from a different decommission term
func (partition *DataPartition) canMarkDecommission(term uint64) bool {
// the dp's decommission status may not have been reset since the last decommission
if partition.DecommissionTerm != term {
return true
}
status := partition.GetDecommissionStatus()
if status == DecommissionInitial ||
status == DecommissionPause ||
status == DecommissionFail {
return true
}
return false
}
func (partition *DataPartition) canAddToDecommissionList() bool {
status := partition.GetDecommissionStatus()
if status == DecommissionInitial ||
status == DecommissionPause ||
status == DecommissionSuccess ||
status == DecommissionFail {
return false
}
return true
}
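// tryRollback rolls the partition back if needRollback allows it, counting the attempt in
// DecommissionNeedRollbackTimes.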
func (partition *DataPartition) tryRollback(c *Cluster) bool {
if !partition.needRollback(c) {
return false
}
partition.DecommissionNeedRollbackTimes++
partition.rollback(c)
return true
}
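// pauseReplicaRepair asks the replica on replicaAddr to stop (stop=true) or resume
// (stop=false) data partition repair, retrying the node lookup and admin task a few times
// before giving up. The recover timers are adjusted so that time spent paused is not
// counted as repair time.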
func (partition *DataPartition) pauseReplicaRepair(replicaAddr string, stop bool, c *Cluster) bool {
index := partition.findReplica(replicaAddr)
if index == -1 {
log.LogWarnf("action[pauseReplicaRepair]dp[%v] can't find replica %v", partition.PartitionID, replicaAddr)
// maybe paused from rollback[mark]
return true
}
const RetryMax = 5
var (
dataNode *DataNode
err error
retry = 0
)
for retry <= RetryMax {
if dataNode, err = c.dataNode(replicaAddr); err != nil {
retry++
time.Sleep(time.Second)
log.LogWarnf("action[pauseReplicaRepair]dp[%v] can't find dataNode %v", partition.PartitionID, partition.DecommissionSrcAddr)
continue
}
task := partition.createTaskToStopDataPartitionRepair(replicaAddr, stop)
packet, err := dataNode.TaskManager.syncSendAdminTask(task)
if err != nil {
retry++
time.Sleep(time.Second)
log.LogWarnf("action[pauseReplicaRepair]dp[%v] send stop task failed %v", partition.PartitionID, err.Error())
continue
}
if !stop {
partition.RecoverStartTime = time.Now().Add(-partition.RecoverLastConsumeTime)
partition.RecoverLastConsumeTime = time.Duration(0)
log.LogDebugf("action[pauseReplicaRepair]dp[%v] replica %v RecoverStartTime sub %v seconds",
partition.PartitionID, replicaAddr, partition.RecoverLastConsumeTime.Seconds())
} else {
partition.RecoverLastConsumeTime = time.Since(partition.RecoverStartTime)
log.LogDebugf("action[pauseReplicaRepair]dp[%v] replica %v already recover %v seconds",
partition.PartitionID, replicaAddr, partition.RecoverLastConsumeTime.Seconds())
}
log.LogDebugf("action[pauseReplicaRepair]dp[%v] send stop to replica %v packet %v", partition.PartitionID, replicaAddr, packet)
return true
}
return false
}
func (partition *DataPartition) findReplica(replicaAddr string) int {
partition.Lock()
defer partition.Unlock()
var (
replica *DataReplica
index = -1
)
for i := 0; i < len(partition.Replicas); i++ {
replica = partition.Replicas[i]
if replica.Addr == replicaAddr {
index = i
break
}
}
return index
}
func (partition *DataPartition) createTaskToStopDataPartitionRepair(addr string, stop bool) (task *proto.AdminTask) {
task = proto.NewAdminTask(proto.OpStopDataPartitionRepair, addr, newStopDataPartitionRepairRequest(partition.PartitionID, stop))
partition.resetTaskID(task)
return
}
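// TryAcquireDecommissionToken tries to take a decommission token from the nodeset that
// will host the new replica. When no destination was specified it first picks a target
// host, preferring the source nodeset, then the source zone, then another zone. Retried
// partitions wait MaxRetryDecommissionWait rounds before trying again.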
func (partition *DataPartition) TryAcquireDecommissionToken(c *Cluster) bool {
var (
zone *Zone
ns *nodeSet
err error
targetHosts []string
excludeNodeSets []uint64
zones []string
)
const MaxRetryDecommissionWait = 60
defer c.syncUpdateDataPartition(partition)
if partition.DecommissionRetry > 0 {
partition.DecommissionWaitTimes++
if partition.DecommissionWaitTimes < MaxRetryDecommissionWait {
// log.LogDebugf("action[TryAcquireDecommissionToken] dp %v wait %v", partition.PartitionID, partition.DecommissionWaitTimes)
return false
} else {
partition.DecommissionWaitTimes = 0
}
}
// first attempt, and no destination address was specified
if !partition.DecommissionDstAddrSpecify && partition.DecommissionDstAddr == "" {
// try to find available data node in src nodeset
ns, zone, err = getTargetNodeset(partition.DecommissionSrcAddr, c)
if err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v find src nodeset failed:%v",
partition.PartitionID, err.Error())
goto errHandler
}
targetHosts, _, err = ns.getAvailDataNodeHosts(partition.Hosts, 1)
if err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v choose from src nodeset failed:%v",
partition.PartitionID, err.Error())
if _, ok := c.vols[partition.VolName]; !ok {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v cannot find vol:%v",
partition.PartitionID, err.Error())
goto errHandler
}
if c.isFaultDomain(c.vols[partition.VolName]) {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v is fault domain",
partition.PartitionID)
goto errHandler
}
excludeNodeSets = append(excludeNodeSets, ns.ID)
if targetHosts, _, err = zone.getAvailNodeHosts(TypeDataPartition, excludeNodeSets, partition.Hosts, 1); err != nil {
// select data nodes from another zone
zones = partition.getLiveZones(partition.DecommissionSrcAddr)
var excludeZone []string
if len(zones) == 0 {
excludeZone = append(excludeZone, zone.name)
} else {
excludeZone = append(excludeZone, zones[0])
}
if targetHosts, _, err = c.getHostFromNormalZone(TypeDataPartition, excludeZone, excludeNodeSets, partition.Hosts, 1, 1, ""); err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v getHostFromNormalZone failed:%v",
partition.PartitionID, err.Error())
goto errHandler
}
}
// get nodeset for target host
newAddr := targetHosts[0]
ns, zone, err = getTargetNodeset(newAddr, c)
if err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v find new nodeset failed:%v",
partition.PartitionID, err.Error())
goto errHandler
}
}
// only persist DecommissionDstAddr once the token has been acquired
if ns.AcquireDecommissionToken(partition.PartitionID) {
partition.DecommissionDstAddr = targetHosts[0]
log.LogDebugf("action[TryAcquireDecommissionToken] dp %v get token from %v nodeset %v success",
partition.PartitionID, partition.DecommissionDstAddr, ns.ID)
return true
} else {
log.LogDebugf("action[TryAcquireDecommissionToken] dp %v: nodeset %v token is empty",
partition.PartitionID, ns.ID)
return false
}
} else {
ns, zone, err = getTargetNodeset(partition.DecommissionDstAddr, c)
if err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken]dp %v find src nodeset failed:%v",
partition.PartitionID, err.Error())
goto errHandler
}
if ns.AcquireDecommissionToken(partition.PartitionID) {
log.LogDebugf("action[TryAcquireDecommissionToken]dp %v get token from %v nodeset %v success",
partition.PartitionID, partition.DecommissionDstAddr, ns.ID)
return true
} else {
return false
}
}
errHandler:
partition.DecommissionRetry++
if partition.DecommissionRetry >= defaultDecommissionRetryLimit {
partition.SetDecommissionStatus(DecommissionFail)
} else {
partition.DecommissionWaitTimes = 0
}
log.LogWarnf("action[TryAcquireDecommissionToken] clusterID[%v] vol[%v] partitionID[%v]"+
" retry [%v] status [%v] DecommissionDstAddrSpecify [%v] DecommissionDstAddr [%v] failed",
c.Name, partition.VolName, partition.PartitionID, partition.DecommissionRetry, partition.GetDecommissionStatus(),
partition.DecommissionDstAddrSpecify, partition.DecommissionDstAddr)
return false
}
func (partition *DataPartition) ReleaseDecommissionToken(c *Cluster) {
if partition.DecommissionDstAddr == "" {
return
}
if ns, _, err := getTargetNodeset(partition.DecommissionDstAddr, c); err != nil {
log.LogWarnf("action[ReleaseDecommissionToken]should never happen dp %v:%v", partition.PartitionID, err.Error())
return
} else {
ns.ReleaseDecommissionToken(partition.PartitionID)
}
}
//func (partition *DataPartition) ShouldReleaseDecommissionTokenByStop(c *Cluster) {
// if partition.DecommissionDstAddr == "" && !partition.DecommissionDstAddrSpecify {
// return
// }
// index := partition.findReplica(partition.DecommissionDstAddr)
// if index == -1 {
// log.LogWarnf("action[ShouldReleaseDecommissionTokenByStop]dp[%v] has not added replica %v",
// partition.PartitionID, partition.DecommissionDstAddr)
// }
// partition.ReleaseDecommissionToken(c)
//}
func (partition *DataPartition) restoreReplicaMeta(c *Cluster) (err error) {
//dst has
//dstDataNode, err := c.dataNode(partition.DecommissionDstAddr)
//if err != nil {
// log.LogWarnf("action[restoreReplicaMeta]partition %v find dst %v data node failed:%v",
// partition.PartitionID, partition.DecommissionDstAddr, err.Error())
// return
//}
//removePeer := proto.Peer{ID: dstDataNode.ID, Addr: partition.DecommissionDstAddr}
//if err = c.removeHostMember(partition, removePeer); err != nil {
// log.LogWarnf("action[restoreReplicaMeta]partition %v metadata removeReplica %v failed:%v",
// partition.PartitionID, partition.DecommissionDstAddr, err.Error())
// return
//}
srcDataNode, err := c.dataNode(partition.DecommissionSrcAddr)
if err != nil {
log.LogWarnf("action[restoreReplicaMeta]partition %v find src %v data node failed:%v",
partition.PartitionID, partition.DecommissionSrcAddr, err.Error())
return
}
addPeer := proto.Peer{ID: srcDataNode.ID, Addr: partition.DecommissionSrcAddr}
if err = c.addDataPartitionRaftMember(partition, addPeer); err != nil {
log.LogWarnf("action[restoreReplicaMeta]partition %v metadata addReplica %v failed:%v",
partition.PartitionID, partition.DecommissionSrcAddr, err.Error())
return
}
log.LogDebugf("action[restoreReplicaMeta]partition %v meta data has restored:hosts [%v] peers[%v]",
partition.PartitionID, partition.Hosts, partition.Peers)
return
}
func getTargetNodeset(addr string, c *Cluster) (ns *nodeSet, zone *Zone, err error) {
var dataNode *DataNode
dataNode, err = c.dataNode(addr)
if err != nil {
log.LogWarnf("action[getTargetNodeset] find src %v data node failed:%v", addr, err.Error())
return nil, nil, err
}
zone, err = c.t.getZone(dataNode.ZoneName)
if err != nil {
log.LogWarnf("action[getTargetNodeset] find src %v zone failed:%v", addr, err.Error())
return nil, nil, err
}
ns, err = zone.getNodeSet(dataNode.NodeSetID)
if err != nil {
log.LogWarnf("action[getTargetNodeset] find src %v nodeset failed:%v", addr, err.Error())
return nil, nil, err
}
return ns, zone, nil
}
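// needRollback reports whether the failed attempt should be rolled back. Partitions with
// an explicitly specified destination are never rolled back, and after
// defaultDecommissionRollbackLimit rollbacks the source replica is restored instead.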
func (partition *DataPartition) needRollback(c *Cluster) bool {
log.LogDebugf("action[needRollback]dp[%v]DecommissionNeedRollbackTimes[%v]", partition.PartitionID, partition.DecommissionNeedRollbackTimes)
// the rollback flag is only set for failures while adding a replica, creating the dp, or repairing the dp; other errors do not need rollback
if !partition.DecommissionNeedRollback {
return false
}
// a partition with an explicitly specified destination address does not need rollback
if partition.DecommissionDstAddrSpecify {
log.LogWarnf("action[needRollback]dp[%v] do not rollback for DecommissionDstAddrSpecify", partition.PartitionID)
return false
}
if partition.DecommissionNeedRollbackTimes >= defaultDecommissionRollbackLimit {
log.LogDebugf("action[needRollback]try add restore replica, dp[%v]DecommissionNeedRollbackTimes[%v]",
partition.PartitionID, partition.DecommissionNeedRollbackTimes)
partition.DecommissionNeedRollback = false
err := c.addDataReplica(partition, partition.DecommissionSrcAddr)
if err != nil {
log.LogWarnf("action[needRollback]dp[%v] recover decommission src replica %v failed: %v",
partition.PartitionID, partition.DecommissionSrcAddr, err)
}
err = c.removeDataReplica(partition, partition.DecommissionDstAddr, false, false)
if err != nil {
log.LogWarnf("action[needRollback]dp[%v] remove decommission dst replica %v failed: %v",
partition.PartitionID, partition.DecommissionDstAddr, err)
}
return false
}
return true
}
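// restoreReplica rolls a failed decommission back by removing the destination replica
// and re-adding the decommission source replica.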
func (partition *DataPartition) restoreReplica(c *Cluster) {
var err error
err = c.removeDataReplica(partition, partition.DecommissionDstAddr, false, false)
if err != nil {
log.LogWarnf("action[restoreReplica]dp[%v] rollback to del replica[%v] failed:%v",
partition.PartitionID, partition.DecommissionDstAddr, err.Error())
} else {
log.LogDebugf("action[restoreReplica]dp[%v] rollback to del replica[%v] success",
partition.PartitionID, partition.DecommissionDstAddr)
}
err = c.addDataReplica(partition, partition.DecommissionSrcAddr)
if err != nil {
log.LogWarnf("action[restoreReplica]dp[%v] recover decommission src replica failed", partition.PartitionID)
} else {
log.LogDebugf("action[restoreReplica]dp[%v] rollback to add replica[%v] success",
partition.PartitionID, partition.DecommissionSrcAddr)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strconv"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
func (partition *DataPartition) checkStatus(clusterName string, needLog bool, dpTimeOutSec int64, c *Cluster,
shouldDpInhibitWriteByVolFull bool, forbiddenVol bool) {
partition.Lock()
defer partition.Unlock()
var liveReplicas []*DataReplica
if proto.IsNormalDp(partition.PartitionType) {
liveReplicas = partition.getLiveReplicasFromHosts(dpTimeOutSec)
if len(partition.Replicas) > len(partition.Hosts) {
partition.Status = proto.ReadOnly
msg := fmt.Sprintf("action[extractStatus],partitionID:%v has exceed repica, replicaNum:%v liveReplicas:%v Status:%v RocksDBHost:%v ",
partition.PartitionID, partition.ReplicaNum, len(liveReplicas), partition.Status, partition.Hosts)
Warn(clusterName, msg)
return
}
} else {
liveReplicas = partition.getLiveReplicas(dpTimeOutSec)
}
switch len(liveReplicas) {
case (int)(partition.ReplicaNum):
partition.Status = proto.ReadOnly
if partition.checkReplicaEqualStatus(liveReplicas, proto.ReadWrite) &&
partition.hasEnoughAvailableSpace() &&
!shouldDpInhibitWriteByVolFull {
writable := false
if proto.IsNormalDp(partition.PartitionType) {
if partition.getLeaderAddr() != "" {
writable = true
}
} else {
// cold volume has no leader
writable = true
}
// set status to ReadWrite only if the volume is not forbidden
if writable && !forbiddenVol {
partition.Status = proto.ReadWrite
}
}
default:
partition.Status = proto.ReadOnly
}
// keep the partition read-only while the special-replica decommission is still in progress
if partition.isSpecialReplicaCnt() && partition.GetSpecialReplicaDecommissionStep() > 0 {
log.LogInfof("action[checkStatus] partition %v with Special replica cnt %v on decommison status %v, live replicacnt %v",
partition.PartitionID, partition.ReplicaNum, partition.Status, len(liveReplicas))
partition.Status = proto.ReadOnly
}
if partition.checkReplicaEqualStatus(liveReplicas, proto.Unavailable) {
log.LogWarnf("action[checkStatus] partition %v bet set Unavailable", partition.PartitionID)
partition.Status = proto.Unavailable
}
if needLog && len(liveReplicas) != int(partition.ReplicaNum) {
msg := fmt.Sprintf("action[extractStatus],partitionID:%v replicaNum:%v liveReplicas:%v Status:%v RocksDBHost:%v ",
partition.PartitionID, partition.ReplicaNum, len(liveReplicas), partition.Status, partition.Hosts)
log.LogInfo(msg)
if time.Now().Unix()-partition.lastWarnTime > intervalToWarnDataPartition {
Warn(clusterName, msg)
partition.lastWarnTime = time.Now().Unix()
}
}
}
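// hasEnoughAvailableSpace reports whether the partition still has more than 10 GB of free space.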
func (partition *DataPartition) hasEnoughAvailableSpace() bool {
avail := partition.total - partition.used
return int64(avail) > 10*util.GB
}
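// checkReplicaNotHaveStatus returns true only when none of the live replicas is in the given status.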
func (partition *DataPartition) checkReplicaNotHaveStatus(liveReplicas []*DataReplica, status int8) (equal bool) {
for _, replica := range liveReplicas {
if replica.Status == status {
log.LogInfof("action[checkReplicaNotHaveStatus] partition %v replica %v status %v dst status %v",
partition.PartitionID, replica.Addr, replica.Status, status)
return
}
}
return true
}
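// checkReplicaEqualStatus returns true only when every live replica is in the given status.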
func (partition *DataPartition) checkReplicaEqualStatus(liveReplicas []*DataReplica, status int8) (equal bool) {
for _, replica := range liveReplicas {
if replica.Status != status {
log.LogDebugf("action[checkReplicaEqualStatus] partition %v replica %v status %v dst status %v",
partition.PartitionID, replica.Addr, replica.Status, status)
return
}
}
return true
}
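// checkReplicaStatus downgrades stale replicas, and replicas on read-only nodes or
// read-only partitions, from ReadWrite to ReadOnly.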
func (partition *DataPartition) checkReplicaStatus(timeOutSec int64) {
partition.Lock()
defer partition.Unlock()
for _, replica := range partition.Replicas {
if !replica.isLive(timeOutSec) {
log.LogInfof("action[checkReplicaStatus] partition %v replica %v be set status ReadOnly", partition.PartitionID, replica.Addr)
if replica.Status == proto.ReadWrite {
replica.Status = proto.ReadOnly
}
if partition.isSpecialReplicaCnt() {
return
}
continue
}
if (replica.dataNode.RdOnly || partition.RdOnly) && replica.Status == proto.ReadWrite {
replica.Status = proto.ReadOnly
}
}
}
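// checkLeader clears the leader flag of dead replicas and, for normal partitions,
// reports to WarnMetrics whether the partition currently has no leader.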
func (partition *DataPartition) checkLeader(clusterID string, timeOut int64) {
partition.Lock()
defer partition.Unlock()
for _, dr := range partition.Replicas {
if !dr.isLive(timeOut) {
dr.IsLeader = false
}
}
if !proto.IsNormalDp(partition.PartitionType) {
return
}
var report bool
if partition.getLeaderAddr() == "" {
report = true
}
if WarnMetrics != nil {
WarnMetrics.WarnDpNoLeader(clusterID, partition.PartitionID, report)
}
return
}
// Check if there is any missing replica for a data partition.
func (partition *DataPartition) checkMissingReplicas(clusterID, leaderAddr string, dataPartitionMissSec, dataPartitionWarnInterval int64) {
partition.Lock()
defer partition.Unlock()
id := strconv.FormatUint(partition.PartitionID, 10)
_, ok := WarnMetrics.dpMissingReplicaInfo[id]
oldMissingReplicaNum := 0
if ok {
oldMissingReplicaNum = len(WarnMetrics.dpMissingReplicaInfo[id].addrs)
}
for _, replica := range partition.Replicas {
if partition.hasHost(replica.Addr) && replica.isMissing(dataPartitionMissSec) && !partition.IsDiscard {
if partition.needToAlarmMissingDataPartition(replica.Addr, dataPartitionWarnInterval) {
dataNode := replica.getReplicaNode()
var lastReportTime time.Time
isActive := true
if dataNode != nil {
lastReportTime = dataNode.ReportTime
isActive = dataNode.isActive
}
msg := fmt.Sprintf("action[checkMissErr],clusterID[%v] paritionID:%v on node:%v "+
"miss time > %v lastRepostTime:%v dnodeLastReportTime:%v nodeisActive:%v So Migrate by manual",
clusterID, partition.PartitionID, replica.Addr, dataPartitionMissSec, replica.ReportTime, lastReportTime, isActive)
// msg = msg + fmt.Sprintf(" decommissionDataPartitionURL is http://%v/dataPartition/decommission?id=%v&addr=%v", leaderAddr, partition.PartitionID, replica.Addr)
Warn(clusterID, msg)
if WarnMetrics != nil {
WarnMetrics.WarnMissingDp(clusterID, replica.Addr, partition.PartitionID, true)
}
}
} else {
if WarnMetrics != nil {
WarnMetrics.WarnMissingDp(clusterID, replica.Addr, partition.PartitionID, false)
}
}
}
if WarnMetrics != nil {
WarnMetrics.CleanObsoleteDpMissing(clusterID, partition)
}
WarnMetrics.dpMissingReplicaMutex.Lock()
replicaInfo, ok := WarnMetrics.dpMissingReplicaInfo[id]
if ok {
MissingReplicaNum := len(replicaInfo.addrs)
oldDpReplicaAliveNum := ""
if MissingReplicaNum != oldMissingReplicaNum && oldMissingReplicaNum != 0 {
oldDpReplicaAliveNum = WarnMetrics.dpMissingReplicaInfo[id].replicaAlive
}
dpReplicaMissingNum := uint8(len(WarnMetrics.dpMissingReplicaInfo[id].addrs))
dpReplicaAliveNum := partition.ReplicaNum - dpReplicaMissingNum
replicaInfo.replicaNum = strconv.FormatUint(uint64(partition.ReplicaNum), 10)
replicaInfo.replicaAlive = strconv.FormatUint(uint64(dpReplicaAliveNum), 10)
WarnMetrics.dpMissingReplicaInfo[id] = replicaInfo
for missingReplicaAddr := range WarnMetrics.dpMissingReplicaInfo[id].addrs {
if oldDpReplicaAliveNum != "" {
WarnMetrics.missingDp.DeleteLabelValues(clusterID, id, missingReplicaAddr, oldDpReplicaAliveNum, replicaInfo.replicaNum)
}
WarnMetrics.missingDp.SetWithLabelValues(1, clusterID, id, missingReplicaAddr, replicaInfo.replicaAlive, replicaInfo.replicaNum)
}
}
WarnMetrics.dpMissingReplicaMutex.Unlock()
if !proto.IsNormalDp(partition.PartitionType) {
return
}
for _, addr := range partition.Hosts {
if partition.hasMissingDataPartition(addr) && partition.needToAlarmMissingDataPartition(addr, dataPartitionWarnInterval) {
msg := fmt.Sprintf("action[checkMissErr],clusterID[%v] partitionID:%v on node:%v "+
"miss time > :%v but server not exsit So Migrate", clusterID, partition.PartitionID, addr, dataPartitionMissSec)
msg = msg + fmt.Sprintf(" decommissionDataPartitionURL is http://%v/dataPartition/decommission?id=%v&addr=%v", leaderAddr, partition.PartitionID, addr)
Warn(clusterID, msg)
}
}
}
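// needToAlarmMissingDataPartition reports whether the missing-replica alarm for addr should
// fire: either it has never fired before, or the warn interval has elapsed since the last alarm.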
func (partition *DataPartition) needToAlarmMissingDataPartition(addr string, interval int64) (shouldAlarm bool) {
t, ok := partition.MissingNodes[addr]
if !ok {
partition.MissingNodes[addr] = time.Now().Unix()
shouldAlarm = true
} else {
if time.Now().Unix()-t > interval {
shouldAlarm = true
partition.MissingNodes[addr] = time.Now().Unix()
}
}
return
}
func (partition *DataPartition) hasMissingDataPartition(addr string) (isMissing bool) {
_, ok := partition.hasReplica(addr)
if !ok {
isMissing = true
}
return
}
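// checkDiskError marks the partition read-only when some (but not all) of its replicas are
// Unavailable, and warns with a disk-decommission URL for each affected disk.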
func (partition *DataPartition) checkDiskError(clusterID, leaderAddr string) {
diskErrorAddrs := make(map[string]string)
partition.Lock()
defer partition.Unlock()
for _, addr := range partition.Hosts {
replica, ok := partition.hasReplica(addr)
if !ok {
continue
}
if replica.Status == proto.Unavailable {
if partition.isSpecialReplicaCnt() && len(partition.Hosts) > 1 {
log.LogWarnf("action[%v],clusterID[%v],partitionID:%v On :%v status Unavailable",
checkDataPartitionDiskErr, clusterID, partition.PartitionID, addr)
continue
}
diskErrorAddrs[replica.Addr] = replica.DiskPath
}
}
if len(diskErrorAddrs) != (int)(partition.ReplicaNum) && len(diskErrorAddrs) > 0 {
partition.Status = proto.ReadOnly
}
for addr, diskPath := range diskErrorAddrs {
msg := fmt.Sprintf("action[%v],clusterID[%v],partitionID:%v On :%v Disk Error,So Remove it From RocksDBHost, decommissionDiskURL is http://%v/disk/decommission?addr=%v&disk=%v",
checkDataPartitionDiskErr, clusterID, partition.PartitionID, addr, leaderAddr, addr, diskPath)
Warn(clusterID, msg)
}
return
}
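// checkReplicationTask warns about and removes replicas that are no longer in the host list,
// and warns when a non-writable partition has a host without a replica.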
func (partition *DataPartition) checkReplicationTask(clusterID string, dataPartitionSize uint64) {
var msg string
if excessAddr, excessErr := partition.deleteIllegalReplica(); excessErr != nil {
msg = fmt.Sprintf("action[%v], partitionID:%v Excess Replication On :%v Err:%v rocksDBRecords:%v",
deleteIllegalReplicaErr, partition.PartitionID, excessAddr, excessErr.Error(), partition.Hosts)
Warn(clusterID, msg)
partition.Lock()
partition.removeReplicaByAddr(excessAddr)
partition.Unlock()
}
if partition.Status == proto.ReadWrite {
return
}
if lackAddr, lackErr := partition.missingReplicaAddress(dataPartitionSize); lackErr != nil {
msg = fmt.Sprintf("action[%v], partitionID:%v Lack Replication On :%v Err:%v Hosts:%v new task to create DataReplica",
addMissingReplicaErr, partition.PartitionID, lackAddr, lackErr.Error(), partition.Hosts)
Warn(clusterID, msg)
}
return
}
func (partition *DataPartition) deleteIllegalReplica() (excessAddr string, err error) {
partition.Lock()
defer partition.Unlock()
for i := 0; i < len(partition.Replicas); i++ {
replica := partition.Replicas[i]
if ok := partition.hasHost(replica.Addr); !ok {
excessAddr = replica.Addr
err = proto.ErrIllegalDataReplica
break
}
}
return
}
func (partition *DataPartition) missingReplicaAddress(dataPartitionSize uint64) (addr string, err error) {
partition.Lock()
defer partition.Unlock()
if time.Now().Unix()-partition.createTime < 120 {
return
}
// go through all the hosts to find the missing replica
for _, host := range partition.Hosts {
if _, ok := partition.hasReplica(host); !ok {
log.LogError(fmt.Sprintf("action[missingReplicaAddress],partitionID:%v lack replication:%v",
partition.PartitionID, host))
err = proto.ErrMissingReplica
addr = host
break
}
}
return
}
func (partition *DataPartition) checkReplicaSize(clusterID string, diffSpaceUsage uint64) {
partition.RLock()
defer partition.RUnlock()
if len(partition.Replicas) == 0 {
return
}
diff := 0.0
sentry := float64(partition.Replicas[0].Used)
for _, dr := range partition.Replicas {
temp := math.Abs(float64(dr.Used) - sentry)
if temp > diff {
diff = temp
}
}
if diff > float64(diffSpaceUsage) {
msg := fmt.Sprintf("action[checkReplicaSize] vol[%v],partition[%v] difference space usage [%v] larger than %v, ",
partition.VolName, partition.PartitionID, diff, diffSpaceUsage)
for _, dr := range partition.Replicas {
msg = msg + fmt.Sprintf("replica[%v],used[%v];", dr.Addr, dr.Used)
}
Warn(clusterID, msg)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"runtime"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/log"
)
// DataPartitionMap stores all the data partitions of a volume
type DataPartitionMap struct {
sync.RWMutex
partitionMap map[uint64]*DataPartition
readableAndWritableCnt int // number of readable and writable partitionMap
lastLoadedIndex uint64 // last loaded partition index
lastReleasedIndex uint64 // last released partition index
partitions []*DataPartition
responseCache []byte
responseCompressCache []byte
lastAutoCreateTime time.Time
volName string
readMutex sync.RWMutex
}
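// newDataPartitionMap creates an empty DataPartitionMap for the given volume.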
func newDataPartitionMap(volName string) (dpMap *DataPartitionMap) {
dpMap = new(DataPartitionMap)
dpMap.partitionMap = make(map[uint64]*DataPartition, 0)
dpMap.partitions = make([]*DataPartition, 0)
dpMap.responseCache = make([]byte, 0)
dpMap.responseCompressCache = make([]byte, 0)
dpMap.volName = volName
dpMap.lastAutoCreateTime = time.Now()
return
}
// attention: this is not a deep clone; the returned slice shares the same *DataPartition elements
func (dpMap *DataPartitionMap) clonePartitions() []*DataPartition {
dpMap.RLock()
defer dpMap.RUnlock()
partitions := make([]*DataPartition, 0)
for _, dp := range dpMap.partitions {
partitions = append(partitions, dp)
}
return partitions
}
func (dpMap *DataPartitionMap) get(ID uint64) (*DataPartition, error) {
dpMap.RLock()
defer dpMap.RUnlock()
if v, ok := dpMap.partitionMap[ID]; ok {
return v, nil
}
return nil, proto.ErrDataPartitionNotExists
}
func (dpMap *DataPartitionMap) del(dp *DataPartition) {
dpMap.Lock()
defer dpMap.Unlock()
_, ok := dpMap.partitionMap[dp.PartitionID]
if !ok {
return
}
dataPartitions := make([]*DataPartition, 0)
for index, partition := range dpMap.partitions {
if partition.PartitionID == dp.PartitionID {
dataPartitions = append(dataPartitions, dpMap.partitions[:index]...)
dataPartitions = append(dataPartitions, dpMap.partitions[index+1:]...)
dpMap.partitions = dataPartitions
break
}
}
delete(dpMap.partitionMap, dp.PartitionID)
}
func (dpMap *DataPartitionMap) put(dp *DataPartition) {
dpMap.Lock()
defer dpMap.Unlock()
_, ok := dpMap.partitionMap[dp.PartitionID]
if !ok {
dpMap.partitions = append(dpMap.partitions, dp)
dpMap.partitionMap[dp.PartitionID] = dp
return
}
// replace the old partition with dp in the map and array
dpMap.partitionMap[dp.PartitionID] = dp
dataPartitions := make([]*DataPartition, 0)
for index, partition := range dpMap.partitions {
if partition.PartitionID == dp.PartitionID {
dataPartitions = append(dataPartitions, dpMap.partitions[:index]...)
dataPartitions = append(dataPartitions, dp)
dataPartitions = append(dataPartitions, dpMap.partitions[index+1:]...)
dpMap.partitions = dataPartitions
break
}
}
}
func (dpMap *DataPartitionMap) setReadWriteDataPartitions(readWrites int, clusterName string) {
dpMap.Lock()
defer dpMap.Unlock()
dpMap.readableAndWritableCnt = readWrites
}
func (dpMap *DataPartitionMap) getDataPartitionResponseCache() []byte {
dpMap.RLock()
defer dpMap.RUnlock()
return dpMap.responseCache
}
func (dpMap *DataPartitionMap) getDataPartitionCompressCache() []byte {
dpMap.RLock()
defer dpMap.RUnlock()
return dpMap.responseCompressCache
}
func (dpMap *DataPartitionMap) setDataPartitionResponseCache(responseCache []byte) {
dpMap.Lock()
defer dpMap.Unlock()
if responseCache != nil {
dpMap.responseCache = responseCache
}
}
func (dpMap *DataPartitionMap) setDataPartitionCompressCache(responseCompress []byte) {
dpMap.Lock()
defer dpMap.Unlock()
if responseCompress != nil {
dpMap.responseCompressCache = responseCompress
}
}
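// updateResponseCache returns the cached JSON reply for the data partition view,
// rebuilding it under readMutex when the cache is empty or needsUpdate is set.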
func (dpMap *DataPartitionMap) updateResponseCache(needsUpdate bool, minPartitionID uint64, volType int) (body []byte, err error) {
responseCache := dpMap.getDataPartitionResponseCache()
if responseCache == nil || needsUpdate || len(responseCache) == 0 {
dpMap.readMutex.Lock()
defer dpMap.readMutex.Unlock()
responseCache = dpMap.getDataPartitionResponseCache()
if !(responseCache == nil || needsUpdate || len(responseCache) == 0) {
body = responseCache
return
}
dpResps := dpMap.getDataPartitionsView(minPartitionID)
if len(dpResps) == 0 && proto.IsHot(volType) {
log.LogError(fmt.Sprintf("action[updateDpResponseCache],volName[%v] minPartitionID:%v,err:%v",
dpMap.volName, minPartitionID, proto.ErrNoAvailDataPartition))
return nil, proto.ErrNoAvailDataPartition
}
cv := proto.NewDataPartitionsView()
cv.DataPartitions = dpResps
reply := newSuccessHTTPReply(cv)
if body, err = json.Marshal(reply); err != nil {
log.LogError(fmt.Sprintf("action[updateDpResponseCache],minPartitionID:%v,err:%v",
minPartitionID, err.Error()))
return nil, proto.ErrMarshalData
}
dpMap.setDataPartitionResponseCache(body)
return
}
body = responseCache
return
}
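// updateCompressCache returns the gzip-compressed partition view, building it from the
// response cache when the compressed cache is empty.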
func (dpMap *DataPartitionMap) updateCompressCache(needsUpdate bool, minPartitionID uint64, volType int) (body []byte, err error) {
body = dpMap.getDataPartitionCompressCache()
if len(body) > 0 {
return
}
if body, err = dpMap.updateResponseCache(needsUpdate, minPartitionID, volType); err != nil {
log.LogErrorf("action[updateCompressCache]updateResponseCache failed,err:%+v", err)
return
}
if body, err = compressor.New(compressor.EncodingGzip).Compress(body); err != nil {
log.LogErrorf("action[updateCompressCache]GzipCompressor.Compress failed,err:%+v", err)
err = proto.ErrCompressFailed
return
}
dpMap.setDataPartitionCompressCache(body)
return
}
func (dpMap *DataPartitionMap) getDataPartitionsView(minPartitionID uint64) (dpResps []*proto.DataPartitionResponse) {
dpResps = make([]*proto.DataPartitionResponse, 0)
log.LogDebugf("volName[%v] DataPartitionMapLen[%v],DataPartitionsLen[%v],minPartitionID[%v]",
dpMap.volName, len(dpMap.partitionMap), len(dpMap.partitions), minPartitionID)
dpMap.RLock()
defer dpMap.RUnlock()
for _, dp := range dpMap.partitionMap {
if len(dp.Hosts) == 0 {
log.LogErrorf("getDataPartitionsView. dp %v host nil", dp.PartitionID)
continue
}
if dp.PartitionID <= minPartitionID {
continue
}
dpResp := dp.convertToDataPartitionResponse()
dpResps = append(dpResps, dpResp)
}
return
}
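// getDataPartitionsToBeReleased walks up to numberOfDataPartitionsToFree partitions,
// starting from lastReleasedIndex, and returns those loaded long enough ago to be released.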
func (dpMap *DataPartitionMap) getDataPartitionsToBeReleased(numberOfDataPartitionsToFree int, secondsToFreeDataPartitionAfterLoad int64) (partitions []*DataPartition, startIndex uint64) {
partitions = make([]*DataPartition, 0)
dpMap.RLock()
defer dpMap.RUnlock()
dpLen := len(dpMap.partitions)
if dpLen == 0 {
return
}
startIndex = dpMap.lastReleasedIndex
count := numberOfDataPartitionsToFree
if dpLen < numberOfDataPartitionsToFree {
count = dpLen
}
for i := 0; i < count; i++ {
if dpMap.lastReleasedIndex >= uint64(dpLen) {
dpMap.lastReleasedIndex = 0
}
dp := dpMap.partitions[dpMap.lastReleasedIndex]
dpMap.lastReleasedIndex++
if time.Now().Unix()-dp.LastLoadedTime >= secondsToFreeDataPartitionAfterLoad {
partitions = append(partitions, dp)
}
}
return
}
func (dpMap *DataPartitionMap) freeMemOccupiedByDataPartitions(partitions []*DataPartition) {
var wg sync.WaitGroup
for _, dp := range partitions {
wg.Add(1)
go func(dp *DataPartition) {
defer func() {
wg.Done()
if err := recover(); err != nil {
const size = runtimeStackBufSize
buf := make([]byte, size)
buf = buf[:runtime.Stack(buf, false)]
log.LogError(fmt.Sprintf("[%v] freeMemOccupiedByDataPartitions panic %v: %s\n", dpMap.volName, err, buf))
}
}()
dp.releaseDataPartition()
}(dp)
}
wg.Wait()
}
func (dpMap *DataPartitionMap) getDataPartitionsToBeChecked(loadFrequencyTime int64) (partitions []*DataPartition, startIndex uint64) {
partitions = make([]*DataPartition, 0)
dpMap.RLock()
defer dpMap.RUnlock()
dpLen := len(dpMap.partitions)
if dpLen == 0 {
return
}
startIndex = dpMap.lastLoadedIndex
// determine the number of data partitions to load
count := dpLen / intervalToLoadDataPartition
if count == 0 {
count = 1
}
for i := 0; i < count; i++ {
if dpMap.lastLoadedIndex >= (uint64)(len(dpMap.partitions)) {
dpMap.lastLoadedIndex = 0
}
dp := dpMap.partitions[dpMap.lastLoadedIndex]
dpMap.lastLoadedIndex++
if time.Now().Unix()-dp.LastLoadedTime >= loadFrequencyTime {
partitions = append(partitions, dp)
}
}
return
}
func (dpMap *DataPartitionMap) totalUsedSpace() (totalUsed uint64) {
dpMap.RLock()
defer dpMap.RUnlock()
for _, dp := range dpMap.partitions {
totalUsed = totalUsed + dp.getMaxUsedSpace()
}
return
}
func (dpMap *DataPartitionMap) setAllDataPartitionsToReadOnly() {
dpMap.Lock()
defer dpMap.Unlock()
changedCnt := 0
for _, dp := range dpMap.partitions {
if proto.ReadWrite == dp.Status {
dp.Status = proto.ReadOnly
changedCnt++
}
}
log.LogDebugf("action[setAllDataPartitionsToReadOnly] ReadWrite->ReadOnly dp cnt: %v", changedCnt)
}
func (dpMap *DataPartitionMap) checkBadDiskDataPartitions(diskPath, nodeAddr string) (partitions []*DataPartition) {
dpMap.RLock()
defer dpMap.RUnlock()
partitions = make([]*DataPartition, 0)
for _, dp := range dpMap.partitionMap {
if dp.containsBadDisk(diskPath, nodeAddr) {
partitions = append(partitions, dp)
}
}
return
}
func (dpMap *DataPartitionMap) getReplicaDiskPaths(nodeAddr string) (diskPaths []string) {
dpMap.RLock()
defer dpMap.RUnlock()
diskPaths = make([]string, 0)
for _, dp := range dpMap.partitionMap {
disk := dp.getReplicaDisk(nodeAddr)
if len(disk) != 0 && !inStingList(disk, diskPaths) {
diskPaths = append(diskPaths, disk)
}
}
return
}
func inStingList(target string, strArray []string) bool {
for _, element := range strArray {
if target == element {
return true
}
}
return false
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// DataReplica represents the replica of a data partition
type DataReplica struct {
proto.DataReplica
dataNode *DataNode
loc uint8
}
func newDataReplica(dataNode *DataNode) (replica *DataReplica) {
replica = new(DataReplica)
replica.dataNode = dataNode
replica.Addr = dataNode.Addr
replica.ReportTime = time.Now().Unix()
return
}
func (replica *DataReplica) setAlive() {
replica.ReportTime = time.Now().Unix()
}
func (replica *DataReplica) isMissing(interval int64) (isMissing bool) {
if time.Now().Unix()-replica.ReportTime > interval {
isMissing = true
}
return
}
func (replica *DataReplica) isLive(timeOutSec int64) (isAvailable bool) {
log.LogDebugf("action[isLive] replica addr %v, datanode active %v replica status %v and is active %v",
replica.Addr, replica.dataNode.isActive, replica.Status, replica.isActive(timeOutSec))
if replica.dataNode.isActive && replica.Status != proto.Unavailable &&
replica.isActive(timeOutSec) {
isAvailable = true
}
return
}
func (replica *DataReplica) isActive(timeOutSec int64) bool {
return time.Now().Unix()-replica.ReportTime <= timeOutSec
}
func (replica *DataReplica) getReplicaNode() (node *DataNode) {
return replica.dataNode
}
// check if the replica's location is available
func (replica *DataReplica) isLocationAvailable() (isAvailable bool) {
dataNode := replica.getReplicaNode()
dataNode.Lock()
defer dataNode.Unlock()
if dataNode.isActive && replica.isActive(defaultDataPartitionTimeOutSec) {
isAvailable = true
}
return
}
func (replica *DataReplica) isRepairing() bool {
return replica.Status == proto.Recovering
}
func (replica *DataReplica) isUnavailable() bool {
return replica.Status == proto.Unavailable
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/util/log"
)
func (c *Cluster) scheduleToCheckDiskRecoveryProgress() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
if c.vols != nil {
c.checkDiskRecoveryProgress()
}
}
time.Sleep(time.Second * defaultIntervalToCheckDataPartition)
}
}()
}
func (c *Cluster) checkDiskRecoveryProgress() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkDiskRecoveryProgress occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkDiskRecoveryProgress occurred panic")
}
}()
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
c.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDataPartitionIds := value.([]uint64)
newBadDpIds := make([]uint64, 0)
for _, partitionID := range badDataPartitionIds {
partition, err := c.getDataPartitionByID(partitionID)
if err != nil {
Warn(c.Name, fmt.Sprintf("checkDiskRecoveryProgress clusterID[%v],partitionID[%v] is not exist", c.Name, partitionID))
continue
}
// do not update status if paused
if partition.IsDecommissionPaused() {
continue
}
_, err = c.getVol(partition.VolName)
if err != nil {
Warn(c.Name, fmt.Sprintf("checkDiskRecoveryProgress clusterID[%v],partitionID[%v] vol(%s) is not exist",
c.Name, partitionID, partition.VolName))
continue
}
log.LogInfof("action[checkDiskRecoveryProgress] dp %v isSpec %v replicas %v conf replicas num %v",
partition.PartitionID, partition.isSpecialReplicaCnt(), len(partition.Replicas), int(partition.ReplicaNum))
if len(partition.Replicas) == 0 {
partition.SetDecommissionStatus(DecommissionSuccess)
log.LogWarnf("action[checkDiskRecoveryProgress] dp %v maybe deleted", partition.PartitionID)
continue
}
//if len(partition.Replicas) == 0 ||
// (!partition.isSpecialReplicaCnt() && len(partition.Replicas) < int(partition.ReplicaNum)) ||
// (partition.isSpecialReplicaCnt() && len(partition.Replicas) > int(partition.ReplicaNum)) {
// newBadDpIds = append(newBadDpIds, partitionID)
// log.LogInfof("action[checkDiskRecoveryProgress] dp %v newBadDpIds [%v] replics %v conf replics num %v",
// partition.PartitionID, newBadDpIds, len(partition.Replicas), int(partition.ReplicaNum))
// continue
//}
newReplica, _ := partition.getReplica(partition.DecommissionDstAddr)
if newReplica == nil {
log.LogWarnf("action[checkDiskRecoveryProgress] dp %v cannot find replica %v", partition.PartitionID,
partition.DecommissionDstAddr)
partition.DecommissionNeedRollback = true
partition.SetDecommissionStatus(DecommissionFail)
continue
}
if newReplica.isRepairing() {
if !partition.isSpecialReplicaCnt() &&
time.Since(partition.RecoverStartTime) > c.GetDecommissionDataPartitionRecoverTimeOut() {
partition.DecommissionNeedRollback = true
partition.SetDecommissionStatus(DecommissionFail)
Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress]clusterID[%v],partitionID[%v] recovered timeout %s",
c.Name, partitionID, time.Since(partition.RecoverStartTime).String()))
} else {
newBadDpIds = append(newBadDpIds, partitionID)
}
} else {
if partition.isSpecialReplicaCnt() {
continue // change dp decommission status in decommission function
}
// do not add to BadDataPartitionIds
if newReplica.isUnavailable() {
partition.DecommissionNeedRollback = true
partition.SetDecommissionStatus(DecommissionFail)
Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress]clusterID[%v],partitionID[%v] has recovered failed", c.Name, partitionID))
} else {
partition.SetDecommissionStatus(DecommissionSuccess) // can be readonly or readwrite
Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress]clusterID[%v],partitionID[%v] has recovered success", c.Name, partitionID))
}
partition.RLock()
c.syncUpdateDataPartition(partition)
partition.RUnlock()
}
}
if len(newBadDpIds) == 0 {
Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress]clusterID[%v],node:disk[%v] has recovered success", c.Name, key))
c.BadDataPartitionIds.Delete(key)
} else {
c.BadDataPartitionIds.Store(key, newBadDpIds)
log.LogInfof("action[checkDiskRecoveryProgress]BadDataPartitionIds key(%s) still have (%d) dp in recover", key, len(newBadDpIds))
}
return true
})
}
func (c *Cluster) addAndSyncDecommissionedDisk(dataNode *DataNode, diskPath string) (err error) {
if exist := dataNode.addDecommissionedDisk(diskPath); exist {
return
}
if err = c.syncUpdateDataNode(dataNode); err != nil {
dataNode.deleteDecommissionedDisk(diskPath)
return
}
log.LogInfof("action[addAndSyncDecommissionedDisk] finish, remaining decommissioned disks[%v], dataNode[%v]", dataNode.getDecommissionedDisks(), dataNode.Addr)
return
}
func (c *Cluster) deleteAndSyncDecommissionedDisk(dataNode *DataNode, diskPath string) (err error) {
if exist := dataNode.deleteDecommissionedDisk(diskPath); !exist {
return
}
if err = c.syncUpdateDataNode(dataNode); err != nil {
dataNode.addDecommissionedDisk(diskPath)
return
}
log.LogInfof("action[deleteAndSyncDecommissionedDisk] finish, remaining decommissioned disks[%v], dataNode[%v]", dataNode.getDecommissionedDisks(), dataNode.Addr)
return
}
func (c *Cluster) decommissionDisk(dataNode *DataNode, raftForce bool, badDiskPath string,
badPartitions []*DataPartition, diskDisable bool) (err error) {
msg := fmt.Sprintf("action[decommissionDisk], Node[%v] OffLine,disk[%v]", dataNode.Addr, badDiskPath)
log.LogWarn(msg)
for _, dp := range badPartitions {
go func(dp *DataPartition) {
if err = c.decommissionDataPartition(dataNode.Addr, dp, raftForce, diskOfflineErr); err != nil {
return
}
}(dp)
}
msg = fmt.Sprintf("action[decommissionDisk],clusterID[%v] node[%v] OffLine success",
c.Name, dataNode.Addr)
Warn(c.Name, msg)
return
}
const (
ManualDecommission uint32 = iota
AutoDecommission
)
type DecommissionDisk struct {
SrcAddr string
DstAddr string
DiskPath string
DecommissionStatus uint32
DecommissionRaftForce bool
DecommissionRetry uint8
DecommissionDpTotal int
DecommissionTerm uint64
DecommissionDpCount int
DiskDisable bool
Type uint32
DecommissionCompleteTime int64
}
func (dd *DecommissionDisk) GenerateKey() string {
return fmt.Sprintf("%s_%s", dd.SrcAddr, dd.DiskPath)
}
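// updateDecommissionStatus recomputes the disk's decommission status and progress from the
// latest decommission results of its data partitions.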
func (dd *DecommissionDisk) updateDecommissionStatus(c *Cluster, debug bool) (uint32, float64) {
var (
progress float64
totalNum = dd.DecommissionDpTotal
partitionIds []uint64
failedPartitionIds []uint64
runningPartitionIds []uint64
preparePartitionIds []uint64
stopPartitionIds []uint64
)
if dd.GetDecommissionStatus() == DecommissionInitial {
return DecommissionInitial, float64(0)
}
if dd.GetDecommissionStatus() == markDecommission {
return markDecommission, float64(0)
}
if totalNum == InvalidDecommissionDpCnt && dd.GetDecommissionStatus() == DecommissionFail {
return DecommissionFail, float64(0)
}
if dd.GetDecommissionStatus() == DecommissionSuccess {
return DecommissionSuccess, float64(1)
}
if dd.GetDecommissionStatus() == DecommissionPause {
return DecommissionPause, float64(0)
}
defer func() {
c.syncUpdateDecommissionDisk(dd)
}()
if dd.DecommissionRetry >= defaultDecommissionRetryLimit {
dd.markDecommissionFailed()
return DecommissionFail, float64(0)
}
// Get all dp on this disk
failedNum := 0
runningNum := 0
prepareNum := 0
stopNum := 0
// get the latest decommission result
partitions := c.getAllDecommissionDataPartitionByDiskAndTerm(dd.SrcAddr, dd.DiskPath, dd.DecommissionTerm)
if len(partitions) == 0 {
log.LogDebugf("action[updateDecommissionDiskStatus]no partitions left:%v", dd.GenerateKey())
dd.markDecommissionSuccess()
return DecommissionSuccess, float64(1)
}
for _, dp := range partitions {
if dp.IsDecommissionFailed() && !dp.needRollback(c) {
failedNum++
failedPartitionIds = append(failedPartitionIds, dp.PartitionID)
}
if dp.GetDecommissionStatus() == DecommissionRunning {
runningNum++
runningPartitionIds = append(runningPartitionIds, dp.PartitionID)
}
if dp.GetDecommissionStatus() == DecommissionPrepare {
prepareNum++
preparePartitionIds = append(preparePartitionIds, dp.PartitionID)
}
// the disk decommission may have been paused before, so paused dps are still counted among the partitions
if dp.GetDecommissionStatus() == DecommissionPause {
stopNum++
stopPartitionIds = append(stopPartitionIds, dp.PartitionID)
}
partitionIds = append(partitionIds, dp.PartitionID)
}
progress = float64(totalNum-len(partitions)) / float64(totalNum)
if debug {
log.LogInfof("action[updateDecommissionDiskStatus] disk[%v] progress[%v] totalNum[%v] "+
"partitionIds %v FailedNum[%v] failedPartitionIds %v, runningNum[%v] runningDp %v, prepareNum[%v] prepareDp %v "+
"stopNum[%v] stopPartitionIds %v ",
dd.GenerateKey(), progress, totalNum, partitionIds, failedNum, failedPartitionIds, runningNum, runningPartitionIds,
prepareNum, preparePartitionIds, stopNum, stopPartitionIds)
}
if failedNum >= (len(partitions)-stopNum) && failedNum != 0 {
dd.markDecommissionFailed()
return DecommissionFail, progress
}
dd.SetDecommissionStatus(DecommissionRunning)
return DecommissionRunning, progress
}
func (dd *DecommissionDisk) GetDecommissionStatus() uint32 {
return atomic.LoadUint32(&dd.DecommissionStatus)
}
func (dd *DecommissionDisk) SetDecommissionStatus(status uint32) {
atomic.StoreUint32(&dd.DecommissionStatus, status)
}
func (dd *DecommissionDisk) markDecommissionSuccess() {
dd.SetDecommissionStatus(DecommissionSuccess)
dd.DecommissionCompleteTime = time.Now().Unix()
}
func (dd *DecommissionDisk) markDecommissionFailed() {
dd.SetDecommissionStatus(DecommissionFail)
dd.DecommissionCompleteTime = time.Now().Unix()
}
func (dd *DecommissionDisk) GetLatestDecommissionDP(c *Cluster) (partitions []*DataPartition) {
partitions = c.getAllDecommissionDataPartitionByDiskAndTerm(dd.SrcAddr, dd.DiskPath, dd.DecommissionTerm)
return
}
func (dd *DecommissionDisk) GetDecommissionFailedDP(c *Cluster) (error, []uint64) {
var (
failedDps []uint64
err error
badPartitions []*DataPartition
)
if dd.GetDecommissionStatus() != DecommissionFail {
err = fmt.Errorf("action[GetDecommissionDiskFailedDP]dataNode[%s] disk[%s] status must be failed,but[%d]",
dd.SrcAddr, dd.DiskPath, dd.GetDecommissionStatus())
return err, failedDps
}
badPartitions = c.getAllDecommissionDataPartitionByDisk(dd.SrcAddr, dd.DiskPath)
for _, dp := range badPartitions {
if dp.IsDecommissionFailed() {
failedDps = append(failedDps, dp.PartitionID)
}
}
log.LogWarnf("action[GetDecommissionDiskFailedDP] failed dp list [%v]", failedDps)
return nil, failedDps
}
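// markDecommission moves the disk into the markDecommission state and increments its
// decommission term; the other attributes are reset unless resuming from a pause.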
func (dd *DecommissionDisk) markDecommission(dstPath string, raftForce bool, limit int) {
// when resuming from the paused state, do not reset these attributes
if dd.GetDecommissionStatus() != DecommissionPause {
dd.DecommissionDpTotal = InvalidDecommissionDpCnt
dd.DecommissionDpCount = limit
dd.DecommissionRaftForce = raftForce
dd.DstAddr = dstPath
dd.DecommissionRetry = 0
}
dd.DecommissionTerm = dd.DecommissionTerm + 1
dd.SetDecommissionStatus(markDecommission)
}
func (dd *DecommissionDisk) canAddToDecommissionList() bool {
status := dd.GetDecommissionStatus()
if status == DecommissionRunning ||
status == markDecommission {
return true
}
return false
}
func (dd *DecommissionDisk) AddToNodeSet() bool {
status := dd.GetDecommissionStatus()
if status == DecommissionRunning ||
status == markDecommission {
return true
}
return false
}
func (dd *DecommissionDisk) IsManualDecommissionDisk() bool {
return dd.Type == ManualDecommission
}
func (dd *DecommissionDisk) CanBePaused() bool {
status := dd.GetDecommissionStatus()
if status == DecommissionRunning || status == markDecommission ||
status == DecommissionPause {
return true
}
return false
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import "sync/atomic"
type DpCountLimiter struct {
cntLimit *uint64
}
func newDpCountLimiter(cntLimit *uint64) DpCountLimiter {
limiter := DpCountLimiter{
cntLimit: cntLimit,
}
return limiter
}
func (cntLimiter *DpCountLimiter) GetCntLimit() uint64 {
limit := uint64(0)
if cntLimiter.cntLimit != nil {
limit = atomic.LoadUint64(cntLimiter.cntLimit)
}
if limit == 0 {
limit = defaultMaxDpCntLimit
}
return limit
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"sort"
"strconv"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/log"
)
// Recover a file if it has a bad CRC or has timed out before.
func (partition *DataPartition) validateCRC(clusterID string) {
partition.Lock()
defer partition.Unlock()
liveReplicas := partition.liveReplicas(defaultDataPartitionTimeOutSec)
if len(liveReplicas) == 0 {
return
}
if len(liveReplicas) < int(partition.ReplicaNum) {
liveAddrs := make([]string, 0)
for _, replica := range liveReplicas {
liveAddrs = append(liveAddrs, replica.Addr)
}
inactiveAddrs := make([]string, 0)
for _, host := range partition.Hosts {
if !contains(liveAddrs, host) {
inactiveAddrs = append(inactiveAddrs, host)
}
}
Warn(clusterID, fmt.Sprintf("vol[%v],dpId[%v],liveAddrs[%v],inactiveAddrs[%v]", partition.VolName, partition.PartitionID, liveAddrs, inactiveAddrs))
}
partition.doValidateCRC(liveReplicas, clusterID)
return
}
func (partition *DataPartition) doValidateCRC(liveReplicas []*DataReplica, clusterID string) {
if !proto.IsNormalDp(partition.PartitionType) {
return
}
for _, fc := range partition.FileInCoreMap {
extentID, err := strconv.ParseUint(fc.Name, 10, 64)
if err != nil {
continue
}
infoFunc := func() string {
return fmt.Sprintf("partition[%v] extentID %v, isTiny %v", partition.PartitionID, extentID, storage.IsTinyExtent(extentID))
}
if storage.IsTinyExtent(extentID) {
partition.checkTinyExtentFile(fc, liveReplicas, clusterID, infoFunc)
} else {
partition.checkExtentFile(fc, liveReplicas, clusterID, infoFunc)
}
}
}
func (partition *DataPartition) checkTinyExtentFile(fc *FileInCore, liveReplicas []*DataReplica, clusterID string, getInfoCallback func() string) {
if !fc.shouldCheckCrc() {
return
}
fms, needRepair := fc.needCrcRepair(liveReplicas, getInfoCallback)
if !needRepair {
return
}
if !hasSameSize(fms) {
msg := fmt.Sprintf("CheckFileError size not match,cluster[%v],dpID[%v],", clusterID, partition.PartitionID)
for _, fm := range fms {
msg = msg + fmt.Sprintf("fm[%v]:size[%v]\n", fm.locIndex, fm.Size)
}
log.LogWarn(msg)
return
}
msg := fmt.Sprintf("CheckFileError crc not match,cluster[%v],dpID[%v]", clusterID, partition.PartitionID)
for _, fm := range fms {
msg = msg + fmt.Sprintf("fm[%v]:%v\n", fm.locIndex, fm)
}
Warn(clusterID, msg)
return
}
func (partition *DataPartition) checkExtentFile(fc *FileInCore, liveReplicas []*DataReplica, clusterID string, getInfoCallback func() string) {
if !fc.shouldCheckCrc() {
return
}
fms, needRepair := fc.needCrcRepair(liveReplicas, getInfoCallback)
if !hasSameSize(fms) {
msg := fmt.Sprintf("CheckFileError size not match,cluster[%v],dpID[%v],", clusterID, partition.PartitionID)
for _, fm := range fms {
msg = msg + fmt.Sprintf("fm[%v]:size[%v]\n", fm.locIndex, fm.Size)
}
log.LogWarn(msg)
return
}
if len(fms) < len(liveReplicas) && (time.Now().Unix()-fc.LastModify) > intervalToCheckMissingReplica {
lastReportTime, ok := partition.FilesWithMissingReplica[fc.Name]
if len(partition.FilesWithMissingReplica) > 400 {
Warn(clusterID, fmt.Sprintf("partitionid[%v] has [%v] files missed replica", partition.PartitionID, len(partition.FilesWithMissingReplica)))
return
}
if !ok {
partition.FilesWithMissingReplica[fc.Name] = time.Now().Unix()
return
}
if time.Now().Unix()-lastReportTime < intervalToCheckMissingReplica {
return
}
liveAddrs := make([]string, 0)
for _, replica := range liveReplicas {
liveAddrs = append(liveAddrs, replica.Addr)
}
Warn(clusterID, fmt.Sprintf("partitionid[%v],file[%v],fms[%v],liveAddr[%v]", partition.PartitionID, fc.Name, fc.getFileMetaAddrs(), liveAddrs))
}
if !needRepair {
log.LogDebugf("checkExtentFile. partition %v all equal so no need compare in details", partition.PartitionID)
return
}
fileCrcArr := fc.calculateCrc(fms)
sort.Sort(fileCrcSorter(fileCrcArr))
maxCountFileCrcIndex := len(fileCrcArr) - 1
if fileCrcArr[maxCountFileCrcIndex].count == 1 {
msg := fmt.Sprintf("checkFileCrcTaskErr clusterID[%v] partitionID:%v File:%v ExtentOffset different between all node "+
" it can not repair it ", clusterID, partition.PartitionID, fc.Name)
msg += (fileCrcSorter)(fileCrcArr).log()
Warn(clusterID, msg)
return
}
for index, crc := range fileCrcArr {
if index != maxCountFileCrcIndex {
badNode := crc.meta
msg := fmt.Sprintf("checkFileCrcTaskErr clusterID[%v] partitionID:%v File:%v badCrc On :%v ",
clusterID, partition.PartitionID, fc.Name, badNode.getLocationAddr())
msg += (fileCrcSorter)(fileCrcArr).log()
Warn(clusterID, msg)
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"time"
"github.com/cubefs/cubefs/util/log"
)
// FileCrc defines the crc of a file
type FileCrc struct {
crc uint32
count int
meta *FileMetadata
}
func newFileCrc(volCrc uint32) (fc *FileCrc) {
fc = new(FileCrc)
fc.crc = volCrc
fc.count = 1
return
}
type fileCrcSorter []*FileCrc
func (fileCrcArr fileCrcSorter) Less(i, j int) bool {
return fileCrcArr[i].count < fileCrcArr[j].count
}
func (fileCrcArr fileCrcSorter) Swap(i, j int) {
fileCrcArr[i], fileCrcArr[j] = fileCrcArr[j], fileCrcArr[i]
}
func (fileCrcArr fileCrcSorter) Len() (length int) {
length = len(fileCrcArr)
return
}
func (fileCrcArr fileCrcSorter) log() (msg string) {
for _, fileCrc := range fileCrcArr {
addr := fileCrc.meta.getLocationAddr()
count := fileCrc.count
crc := fileCrc.crc
msg = fmt.Sprintf(msg+" addr:%v count:%v crc:%v ", addr, count, crc)
}
return
}
func (fc *FileInCore) shouldCheckCrc() bool {
return time.Now().Unix()-fc.LastModify > defaultIntervalToCheckCrc
}
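// needCrcRepair gathers the file metadata of the live replicas and reports that a repair is
// needed when a replica shares the apply ID of the first replica but has a different CRC;
// files with an empty or zero CRC are never repaired.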
func (fc *FileInCore) needCrcRepair(liveReplicas []*DataReplica, getInfoCallback func() string) (fms []*FileMetadata, needRepair bool) {
var baseCrc uint32
fms = make([]*FileMetadata, 0)
for i := 0; i < len(liveReplicas); i++ {
vol := liveReplicas[i]
if fm, ok := fc.getFileMetaByAddr(vol); ok {
fms = append(fms, fm)
}
}
if len(fms) == 0 {
return
}
baseCrc = fms[0].Crc
baseApplyId := fms[0].ApplyID
for _, fm := range fms {
if fm.getFileCrc() == EmptyCrcValue || fm.getFileCrc() == 0 {
needRepair = false
return
}
if fm.ApplyID == baseApplyId && fm.getFileCrc() != baseCrc {
log.LogErrorf("needCrcRepair. getInfoCallback %v, extent %v, applyID(%v:%v), crc %v",
getInfoCallback(), fc.Name, fm.ApplyID, baseApplyId, baseCrc)
needRepair = true
return
}
}
return
}
func hasSameSize(fms []*FileMetadata) (same bool) {
sentry := fms[0].Size
for _, fm := range fms {
if fm.Size != sentry {
return
}
}
return true
}
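// calculateCrc groups the given file metadata by CRC value and counts how many replicas share each CRC.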
func (fc *FileInCore) calculateCrc(badVfNodes []*FileMetadata) (fileCrcArr []*FileCrc) {
badLen := len(badVfNodes)
fileCrcArr = make([]*FileCrc, 0)
for i := 0; i < badLen; i++ {
crcKey := badVfNodes[i].getFileCrc()
isFound := false
var crc *FileCrc
for _, crc = range fileCrcArr {
if crc.crc == crcKey {
isFound = true
break
}
}
if !isFound {
crc = newFileCrc(crcKey)
crc.meta = badVfNodes[i]
fileCrcArr = append(fileCrcArr, crc)
} else {
crc.count++
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"github.com/cubefs/cubefs/proto"
)
// FileMetadata defines the file metadata on a dataNode
type FileMetadata struct {
proto.FileMetadata
locIndex uint8
ApplyID uint64
}
func (fm *FileMetadata) String() (msg string) {
msg = fmt.Sprintf("Crc[%v] LocAddr[%v] locIndex[%v] Size[%v]",
fm.Crc, fm.LocAddr, fm.locIndex, fm.Size)
return
}
func (fm *FileMetadata) getLocationAddr() (loc string) {
return fm.LocAddr
}
func (fm *FileMetadata) getFileCrc() (crc uint32) {
return fm.Crc
}
// FileInCore defines a file in a data partition
type FileInCore struct {
proto.FileInCore
MetadataArray []*FileMetadata
}
func newFileMetadata(volCrc uint32, volLoc string, volLocIndex int, size uint32, applyId uint64) (fm *FileMetadata) {
fm = new(FileMetadata)
fm.Crc = volCrc
fm.LocAddr = volLoc
fm.locIndex = uint8(volLocIndex)
fm.Size = size
fm.ApplyID = applyId
return
}
func newFileInCore(name string) (fc *FileInCore) {
fc = new(FileInCore)
fc.Name = name
fc.MetadataArray = make([]*FileMetadata, 0)
return
}
func (fc FileInCore) clone() *proto.FileInCore {
metadataArray := make([]*proto.FileMetadata, len(fc.MetadataArray))
for i, metadata := range fc.MetadataArray {
metadataArray[i] = &proto.FileMetadata{
Crc: metadata.Crc,
LocAddr: metadata.LocAddr,
Size: metadata.Size,
}
}
return &proto.FileInCore{
Name: fc.Name,
LastModify: fc.LastModify,
MetadataArray: metadataArray,
}
}
// updateFileInCore updates the cached metadata of this file for the given replica location.
func (fc *FileInCore) updateFileInCore(volID uint64, vf *proto.File, volLoc *DataReplica, volLocIndex int) {
if vf.Modified > fc.LastModify {
fc.LastModify = vf.Modified
}
isFind := false
for i := 0; i < len(fc.MetadataArray); i++ {
if fc.MetadataArray[i].getLocationAddr() == volLoc.Addr {
fc.MetadataArray[i].Crc = vf.Crc
fc.MetadataArray[i].Size = vf.Size
fc.MetadataArray[i].ApplyID = vf.ApplyID
isFind = true
break
}
}
if !isFind {
fm := newFileMetadata(vf.Crc, volLoc.Addr, volLocIndex, vf.Size, vf.ApplyID)
fc.MetadataArray = append(fc.MetadataArray, fm)
}
}
func (fc *FileInCore) getFileMetaByAddr(replica *DataReplica) (fm *FileMetadata, ok bool) {
for i := 0; i < len(fc.MetadataArray); i++ {
fm = fc.MetadataArray[i]
if fm.LocAddr == replica.Addr {
ok = true
return
}
}
return
}
func (fc *FileInCore) getFileMetaAddrs() (addrs []string) {
addrs = make([]string, 0)
if len(fc.MetadataArray) == 0 {
return
}
for _, fm := range fc.MetadataArray {
addrs = append(addrs, fm.LocAddr)
}
return
}
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package master
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
type MetaNodeParam struct {
Addr string
ZoneName string
ClusterID string
}
func FuzzCreateVol(data []byte) int {
f := fuzz.NewConsumer(data)
vv := volValue{}
err := f.GenerateStruct(&vv)
if err != nil {
return 0
}
vol := newVol(vv)
if vol == nil {
return 0
}
return 1
}
func FuzzNewMetaNode(data []byte) int {
f := fuzz.NewConsumer(data)
param := MetaNodeParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
node := newMetaNode(param.Addr, param.ZoneName, param.ClusterID)
if node == nil {
return 0
}
return 1
}
package master
import (
"bufio"
"context"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/samsarahq/thunder/graphql"
"github.com/samsarahq/thunder/graphql/schemabuilder"
)
type ClusterService struct {
cluster *Cluster
user *User
conf *clusterConfig
leaderInfo *LeaderInfo
}
func (s *ClusterService) Schema() *graphql.Schema {
schema := schemabuilder.NewSchema()
s.registerObject(schema)
s.registerQuery(schema)
s.registerMutation(schema)
return schema.MustBuild()
}
func (s *ClusterService) registerObject(schema *schemabuilder.Schema) {
object := schema.Object("ClusterView", proto.ClusterView{})
object.FieldFunc("serverCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.dataNodeCount() + s.cluster.metaNodeCount()), nil
})
object.FieldFunc("dataPartitionCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.getDataPartitionCount()), nil
})
object.FieldFunc("metaPartitionCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.getMetaPartitionCount()), nil
})
object.FieldFunc("volumeCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(len(s.cluster.vols)), nil
})
object.FieldFunc("masterCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(len(s.conf.peerAddrs)), nil
})
object.FieldFunc("metaNodeCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.metaNodeCount()), nil
})
object.FieldFunc("dataNodeCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.dataNodeCount()), nil
})
nv := schema.Object("NodeView", proto.NodeView{})
nv.FieldFunc("toMetaNode", func(ctx context.Context, n *proto.NodeView) (*MetaNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.cluster.metaNode(n.Addr)
})
nv.FieldFunc("toDataNode", func(ctx context.Context, n *proto.NodeView) (*DataNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.cluster.dataNode(n.Addr)
})
nv.FieldFunc("reportDisks", func(ctx context.Context, n *proto.NodeView) ([]string, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
node, err := s.cluster.dataNode(n.Addr)
if err != nil {
return nil, err
}
diskmap := make(map[string]bool)
for _, p := range node.DataPartitionReports {
diskmap[p.DiskPath] = true
}
keys := make([]string, 0, len(diskmap))
for key := range diskmap {
keys = append(keys, key)
}
sort.Slice(keys, func(i, j int) bool {
return strings.Compare(keys[i], keys[j]) > 0
})
return keys, nil
})
vs := schema.Object("VolStatInfo", proto.VolStatInfo{})
vs.FieldFunc("toVolume", func(ctx context.Context, n *proto.VolStatInfo) (*Vol, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.cluster.getVol(n.Name)
})
object = schema.Object("DataNode", DataNode{})
object.FieldFunc("isActive", func(ctx context.Context, n *DataNode) bool {
return n.isActive
})
object = schema.Object("metaNode", MetaNode{})
object.FieldFunc("metaPartitionInfos", func(ctx context.Context, n *MetaNode) []*proto.MetaPartitionReport {
return n.metaPartitionInfos
})
}
func (s *ClusterService) registerQuery(schema *schemabuilder.Schema) {
query := schema.Query()
query.FieldFunc("clusterView", s.clusterView)
query.FieldFunc("dataNodeList", s.dataNodeList)
query.FieldFunc("dataNodeListTest", s.dataNodeListTest)
query.FieldFunc("dataNodeGet", s.dataNodeGet)
query.FieldFunc("metaNodeList", s.metaNodeList)
query.FieldFunc("metaNodeGet", s.metaNodeGet)
query.FieldFunc("masterList", s.masterList)
query.FieldFunc("getTopology", s.getTopology)
query.FieldFunc("alarmList", s.alarmList)
}
func (s *ClusterService) registerMutation(schema *schemabuilder.Schema) {
mutation := schema.Mutation()
mutation.FieldFunc("clusterFreeze", s.clusterFreeze)
mutation.FieldFunc("addRaftNode", s.addRaftNode)
mutation.FieldFunc("removeRaftNode", s.removeRaftNode)
mutation.FieldFunc("addMetaNode", s.removeRaftNode)
mutation.FieldFunc("loadMetaPartition", s.loadMetaPartition)
mutation.FieldFunc("decommissionMetaPartition", s.decommissionMetaPartition)
mutation.FieldFunc("decommissionMetaNode", s.decommissionMetaNode)
mutation.FieldFunc("decommissionDisk", s.decommissionDisk)
mutation.FieldFunc("decommissionDataNode", s.decommissionDataNode)
}
// Decommission a disk. This will decommission all the data partitions on this disk.
func (m *ClusterService) decommissionDisk(ctx context.Context, args struct {
OffLineAddr string
DiskPath string
}) (*proto.GeneralResp, error,
) {
node, err := m.cluster.dataNode(args.OffLineAddr)
if err != nil {
return nil, err
}
badPartitions := node.badPartitions(args.DiskPath, m.cluster)
if len(badPartitions) == 0 {
err = fmt.Errorf("node[%v] disk[%v] does not have any data partition", node.Addr, args.DiskPath)
return nil, err
}
var badPartitionIds []uint64
for _, bdp := range badPartitions {
badPartitionIds = append(badPartitionIds, bdp.PartitionID)
}
rstMsg := fmt.Sprintf("receive decommissionDisk node[%v] disk[%v], badPartitionIds[%v] has offline successfully",
node.Addr, args.DiskPath, badPartitionIds)
if err = m.cluster.decommissionDisk(node, false, args.DiskPath, badPartitions, true); err != nil {
return nil, err
}
Warn(m.cluster.Name, rstMsg)
return proto.Success("success"), nil
}
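// Hedged usage sketch (not part of the original service): invoking the decommissionDisk
// handler above directly. The address and disk path are placeholders; in production the
// call arrives through the "decommissionDisk" mutation registered in registerMutation.
func exampleDecommissionDiskCall(svc *ClusterService) error {
args := struct {
OffLineAddr string
DiskPath string
}{OffLineAddr: "192.168.0.11:17310", DiskPath: "/cfs/disk1"}
_, err := svc.decommissionDisk(context.Background(), args)
return err
}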
// Decommission a data node. This will decommission all the data partition on that node.
func (m *ClusterService) decommissionDataNode(ctx context.Context, args struct {
OffLineAddr string
}) (*proto.GeneralResp, error,
) {
node, err := m.cluster.dataNode(args.OffLineAddr)
if err != nil {
return nil, err
}
if err := m.cluster.decommissionDataNode(node, false); err != nil {
return nil, err
}
rstMsg := fmt.Sprintf("decommission data node [%v] submited,please check laster!", args.OffLineAddr)
return proto.Success(rstMsg), nil
}
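// Hedged sketch: one shape the corresponding GraphQL mutation document could take when
// sent to the cluster endpoint. The argument and result field casing follows thunder's
// schemabuilder defaults and is illustrative only, as is the address.
const exampleDecommissionDataNodeMutation = `mutation {
decommissionDataNode(offLineAddr: "192.168.0.12:17310") { code message }
}`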
func (m *ClusterService) decommissionMetaNode(ctx context.Context, args struct {
OffLineAddr string
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
metaNode, err := m.cluster.metaNode(args.OffLineAddr)
if err != nil {
return nil, err
}
if err = m.cluster.decommissionMetaNode(metaNode); err != nil {
return nil, err
}
log.LogInfof("decommissionMetaNode metaNode [%v] has offline successfully", args.OffLineAddr)
return proto.Success("success"), nil
}
func (m *ClusterService) loadMetaPartition(ctx context.Context, args struct {
PartitionID uint64
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
mp, err := m.cluster.getMetaPartitionByID(args.PartitionID)
if err != nil {
return nil, err
}
m.cluster.loadMetaPartitionAndCheckResponse(mp)
log.LogInfof(proto.AdminLoadMetaPartition+" partitionID :%v Load successfully", args.PartitionID)
return proto.Success("success"), nil
}
func (m *ClusterService) decommissionMetaPartition(ctx context.Context, args struct {
PartitionID uint64
NodeAddr string
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
mp, err := m.cluster.getMetaPartitionByID(args.PartitionID)
if err != nil {
return nil, err
}
if err := m.cluster.decommissionMetaPartition(args.NodeAddr, mp); err != nil {
return nil, err
}
log.LogInfof(proto.AdminDecommissionMetaPartition+" partitionID :%v decommissionMetaPartition successfully", args.PartitionID)
return proto.Success("success"), nil
}
func (m *ClusterService) getMetaNode(ctx context.Context, args struct {
NodeAddr string
}) (*MetaNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
metaNode, err := m.cluster.metaNode(args.NodeAddr)
if err != nil {
return nil, err
}
return metaNode, nil
}
// View the topology of the cluster.
func (m *ClusterService) getTopology(ctx context.Context, args struct{}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
tv := &TopologyView{
Zones: make([]*ZoneView, 0),
}
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
cv := newZoneView(zone.name)
cv.Status = zone.getStatusToString()
cv.DataNodesetSelector = zone.GetDataNodesetSelector()
cv.MetaNodesetSelector = zone.GetMetaNodesetSelector()
tv.Zones = append(tv.Zones, cv)
nsc := zone.getAllNodeSet()
for _, ns := range nsc {
nsView := newNodeSetView(ns.dataNodeLen(), ns.metaNodeLen())
cv.NodeSet[ns.ID] = nsView
ns.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
nsView.DataNodes = append(nsView.DataNodes, proto.NodeView{ID: dataNode.ID, Addr: dataNode.Addr, IsActive: dataNode.isActive, IsWritable: dataNode.isWriteAble()})
return true
})
ns.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
nsView.MetaNodes = append(nsView.MetaNodes, proto.NodeView{ID: metaNode.ID, Addr: metaNode.Addr, IsActive: metaNode.IsActive, IsWritable: metaNode.isWritable()})
return true
})
}
}
bs, e := json.Marshal(tv)
if e != nil {
return nil, e
}
return proto.Success(string(bs)), e
}
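// Hedged sketch (illustrative only): decoding the JSON payload produced by getTopology
// back into the TopologyView it was built from. The payload argument is whatever string
// the caller extracted from the GeneralResp returned above.
func exampleDecodeTopologyView(payload string) (*TopologyView, error) {
tv := &TopologyView{}
if err := json.Unmarshal([]byte(payload), tv); err != nil {
return nil, err
}
return tv, nil
}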
func (s *ClusterService) clusterView(ctx context.Context, args struct{}) (*proto.ClusterView, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.makeClusterView(), nil
}
type MasterInfo struct {
Index string
Addr string
IsLeader bool
}
func (s *ClusterService) masterList(ctx context.Context, args struct{}) ([]*MasterInfo, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
list := make([]*MasterInfo, 0)
leader := strings.Split(s.leaderInfo.addr, ":")
for _, addr := range s.conf.peerAddrs {
split := strings.Split(addr, ":")
list = append(list, &MasterInfo{
Index: split[0],
Addr: split[1],
IsLeader: leader[0] == split[1],
})
}
return list, nil
}
func (s *ClusterService) dataNodeGet(ctx context.Context, args struct {
Addr string
}) (*DataNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.cluster.dataNode(args.Addr)
}
func (s *ClusterService) dataNodeList(ctx context.Context, args struct{}) ([]*DataNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
var all []*DataNode
s.cluster.dataNodes.Range(func(_, value interface{}) bool {
all = append(all, value.(*DataNode))
return true
})
return all, nil
}
func (s *ClusterService) dataNodeListTest(ctx context.Context, args struct {
Num int64
}) ([]*DataNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
var all []*DataNode
for i := 0; i < int(args.Num); i++ {
all = append(all, &DataNode{
Total: uint64(i),
Used: 1,
AvailableSpace: 1,
ID: 1,
ZoneName: "123",
Addr: "123123121231",
ReportTime: time.Time{},
isActive: false,
RWMutex: sync.RWMutex{},
UsageRatio: 1,
SelectedTimes: 2,
})
}
return all, nil
}
func (s *ClusterService) metaNodeGet(ctx context.Context, args struct {
Addr string
}) (*MetaNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
mn, found := s.cluster.metaNodes.Load(args.Addr)
if found {
return mn.(*MetaNode), nil
}
return nil, fmt.Errorf("not found meta_node by add:[%s]", args.Addr)
}
func (s *ClusterService) metaNodeList(ctx context.Context, args struct{}) ([]*MetaNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
var all []*MetaNode
s.cluster.metaNodes.Range(func(_, value interface{}) bool {
all = append(all, value.(*MetaNode))
return true
})
return all, nil
}
func (m *ClusterService) addMetaNode(ctx context.Context, args struct {
NodeAddr string
ZoneName string
}) (uint64, error) {
id, err := m.cluster.addMetaNode(args.NodeAddr, args.ZoneName, 0)
if err != nil {
return 0, err
}
return id, nil
}
// Dynamically remove a master node. Similar to addRaftNode, this operation is performed online.
func (m *ClusterService) removeRaftNode(ctx context.Context, args struct {
Id uint64
Addr string
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
if err := m.cluster.removeRaftNode(args.Id, args.Addr); err != nil {
return nil, err
}
log.LogInfof("remove raft node id :%v,adr:%v successfully\n", args.Id, args.Addr)
return proto.Success("success"), nil
}
// Dynamically add a raft node (replica) for the master.
// By using this function, there is no need to stop all the master services. Adding a new raft node is performed online.
func (m *ClusterService) addRaftNode(ctx context.Context, args struct {
Id uint64
Addr string
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
if err := m.cluster.addRaftNode(args.Id, args.Addr); err != nil {
return nil, err
}
log.LogInfof("add raft node id :%v, addr:%v successfully \n", args.Id, args.Addr)
return proto.Success("success"), nil
}
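// Hedged usage sketch for the raft membership mutations above: both handlers require an
// ADMIN caller, so the context must carry a proto.UserInfo the same way registerHandler
// injects it. The id and address are placeholders; removeRaftNode takes the same shape.
func exampleAddRaftNodeCall(svc *ClusterService) error {
admin := &proto.UserInfo{UserID: "root", UserType: proto.UserTypeRoot}
ctx := context.WithValue(context.Background(), proto.UserInfoKey, admin)
args := struct {
Id uint64
Addr string
}{Id: 4, Addr: "192.168.0.14:17010"}
_, err := svc.addRaftNode(ctx, args)
return err
}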
// Turn on or off the automatic allocation of the data partitions.
// If DisableAutoAllocate == off (Status == false), then we WILL automatically allocate new data partitions for the volume when:
// 1. the used space is below the max capacity,
// 2. and the number of r&w data partition is less than 20.
//
// If DisableAutoAllocate == on (Status == true), then we WILL NOT automatically allocate new data partitions for the volume.
func (m *ClusterService) clusterFreeze(ctx context.Context, args struct {
Status bool
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
if err := m.cluster.setDisableAutoAllocate(args.Status); err != nil {
return nil, err
}
return proto.Success("success"), nil
}
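// Hedged usage sketch (not part of the service): toggling automatic data-partition
// allocation through the mutation handler above. Status=true disables allocation,
// Status=false re-enables it; the admin context mirrors what registerHandler injects.
func exampleClusterFreezeCall(svc *ClusterService) error {
admin := &proto.UserInfo{UserID: "root", UserType: proto.UserTypeRoot}
ctx := context.WithValue(context.Background(), proto.UserInfoKey, admin)
_, err := svc.clusterFreeze(ctx, struct{ Status bool }{Status: true})
return err
}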
type WarnMessage struct {
Time string `json:"time"`
Key string `json:"key"`
Hostname string `json:"hostname"`
Type string `json:"type"`
Value string `json:"value"`
Detail string `json:"detail"`
}
func (m *ClusterService) alarmList(ctx context.Context, args struct {
Size int32
}) ([]*WarnMessage, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
size := int64(args.Size * 1000)
list := make([]*WarnMessage, 0, 100)
path := filepath.Join(log.LogDir, "master"+log.CriticalLogFileName)
stat, err := os.Stat(path)
if err != nil {
list = append(list, &WarnMessage{
Time: time.Now().Format("2006-01-02 15:04:05"),
Key: "not found",
Hostname: m.leaderInfo.addr,
Type: "not found",
Value: "not found",
Detail: path + " read has err:" + err.Error(),
})
return list, nil
}
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("open file has err:[%s]", err.Error())
}
if stat.Size() > size {
if _, err := f.Seek(stat.Size()-size, 0); err != nil {
return nil, fmt.Errorf("seek file has err:[%s]", err.Error())
}
}
defer func() {
if err := f.Close(); err != nil {
log.LogErrorf("close alarm file has err:[%s]", err.Error())
}
}()
buf := bufio.NewReader(f)
all, err := io.ReadAll(buf)
if err != nil {
return nil, fmt.Errorf("read file:[%s] size:[%d] has err:[%s]", path, stat.Size(), err.Error())
}
for _, line := range strings.Split(string(all), "\n") {
if len(line) == 0 {
break
}
split := strings.Split(string(line), " ")
var msg *WarnMessage
if len(split) < 7 {
value := string(line)
msg = &WarnMessage{
Time: "unknow",
Key: "parse msg has err",
Hostname: "parse msg has err",
Type: "parse msg has err",
Value: value,
Detail: value,
}
} else {
value := strings.Join(split[6:], " ")
msg = &WarnMessage{
Time: split[0] + " " + split[1],
Key: split[4],
Hostname: split[5],
Type: split[2],
Value: value,
Detail: value,
}
}
list = append(list, msg)
}
// reverse slice
l := len(list)
for i := 0; i < l/2; i++ {
list[i], list[l-i-1] = list[l-i-1], list[i]
}
if len(list) > int(args.Size) {
list = list[:args.Size]
}
return list, nil
}
func (m *ClusterService) makeClusterView() *proto.ClusterView {
cv := &proto.ClusterView{
Name: m.cluster.Name,
LeaderAddr: m.cluster.leaderInfo.addr,
DisableAutoAlloc: m.cluster.DisableAutoAllocate,
ForbidMpDecommission: m.cluster.ForbidMpDecommission,
MetaNodeThreshold: m.cluster.cfg.MetaNodeThreshold,
Applied: m.cluster.fsm.applied,
MaxDataPartitionID: m.cluster.idAlloc.dataPartitionID,
MaxMetaNodeID: m.cluster.idAlloc.commonID,
MaxMetaPartitionID: m.cluster.idAlloc.metaPartitionID,
MetaNodes: make([]proto.NodeView, 0),
DataNodes: make([]proto.NodeView, 0),
VolStatInfo: make([]*proto.VolStatInfo, 0),
BadPartitionIDs: make([]proto.BadPartitionView, 0),
BadMetaPartitionIDs: make([]proto.BadPartitionView, 0),
}
vols := m.cluster.allVolNames()
cv.MetaNodes = m.cluster.allMetaNodes()
cv.DataNodes = m.cluster.allDataNodes()
cv.DataNodeStatInfo = m.cluster.dataNodeStatInfo
cv.MetaNodeStatInfo = m.cluster.metaNodeStatInfo
for _, name := range vols {
stat, ok := m.cluster.volStatInfo.Load(name)
if !ok {
cv.VolStatInfo = append(cv.VolStatInfo, newVolStatInfo(name, 0, 0, 0, 0, 0))
continue
}
cv.VolStatInfo = append(cv.VolStatInfo, stat.(*volStatInfo))
}
m.cluster.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDataPartitionIds := value.([]uint64)
path := key.(string)
bpv := badPartitionView{Path: path, PartitionIDs: badDataPartitionIds}
cv.BadPartitionIDs = append(cv.BadPartitionIDs, bpv)
return true
})
m.cluster.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
badPartitionIds := value.([]uint64)
path := key.(string)
bpv := badPartitionView{Path: path, PartitionIDs: badPartitionIds}
cv.BadMetaPartitionIDs = append(cv.BadMetaPartitionIDs, bpv)
return true
})
return cv
}
package master
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"sort"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/samsarahq/thunder/graphql"
"github.com/samsarahq/thunder/graphql/schemabuilder"
)
type UserService struct {
user *User
cluster *Cluster
}
func (s *UserService) Schema() *graphql.Schema {
schema := schemabuilder.NewSchema()
s.registerObject(schema)
s.registerQuery(schema)
s.registerMutation(schema)
return schema.MustBuild()
}
type UserStatistical struct {
Data uint64
VolumeCount int32
DataPartitionCount int32
MetaPartitionCount int32
}
type AuthorizedVols struct {
Vol string
Authorized []string
}
func (s *UserService) registerObject(schema *schemabuilder.Schema) {
object := schema.Object("UserInfo", proto.UserInfo{})
object.FieldFunc("userStatistical", func(u *proto.UserInfo) (*UserStatistical, error) {
us := &UserStatistical{
VolumeCount: int32(len(u.Policy.OwnVols)),
}
for _, volName := range u.Policy.OwnVols {
v, e := s.cluster.getVol(volName)
if e != nil {
return nil, e
}
us.MetaPartitionCount += int32(len(v.MetaPartitions))
us.DataPartitionCount += int32(len(v.dataPartitions.partitions))
us.Data += v.totalUsedSpace()
}
return us, nil
})
object = schema.Object("UserPolicy", proto.UserPolicy{})
object.FieldFunc("authorizedVols", func(p *proto.UserPolicy) []AuthorizedVols {
var list []AuthorizedVols
for vol, a := range p.AuthorizedVols {
list = append(list, AuthorizedVols{
Vol: vol,
Authorized: a,
})
}
return list
})
}
func (s *UserService) registerQuery(schema *schemabuilder.Schema) {
query := schema.Query()
query.FieldFunc("getUserInfo", s.getUserInfo)
query.FieldFunc("getUserAKInfo", s.getUserAKInfo)
query.FieldFunc("validatePassword", s.validatePassword)
query.FieldFunc("listUserInfo", s.listUserInfo)
query.FieldFunc("topNUser", s.topNUser)
}
func (m *UserService) getUserAKInfo(ctx context.Context, args struct {
AccessKey string
}) (*proto.UserInfo, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
userInfo, err := m.user.getKeyInfo(args.AccessKey)
if err != nil {
return nil, err
}
if perm != ADMIN {
if uid != userInfo.UserID {
return nil, fmt.Errorf("user info not found by you accesskey")
}
}
return userInfo, nil
}
func (s *UserService) registerMutation(schema *schemabuilder.Schema) {
mutation := schema.Mutation()
mutation.FieldFunc("createUser", s.createUser)
mutation.FieldFunc("updateUser", s.updateUser)
mutation.FieldFunc("deleteUser", s.deleteUser)
mutation.FieldFunc("updateUserPolicy", s.updateUserPolicy)
mutation.FieldFunc("removeUserPolicy", s.removeUserPolicy)
mutation.FieldFunc("transferUserVol", s.transferUserVol)
}
func (m *UserService) transferUserVol(ctx context.Context, args proto.UserTransferVolParam) (*proto.UserInfo, error) {
uid, perm, err := permissions(ctx, ADMIN)
if err != nil {
return nil, err
}
vol, err := m.cluster.getVol(args.Volume)
if err != nil {
return nil, err
}
if perm == USER && vol.Owner != uid {
return nil, fmt.Errorf("not have permission for vol:[%s]", args.Volume)
}
if !args.Force && vol.Owner != args.UserSrc {
return nil, fmt.Errorf("force param need validate user name for vol:[%s]", args.Volume)
}
userInfo, err := m.user.transferVol(&args)
if err != nil {
return nil, err
}
owner := vol.Owner
vol.Owner = userInfo.UserID
if err = m.cluster.syncUpdateVol(vol); err != nil {
vol.Owner = owner
return nil, err
}
return userInfo, nil
}
func (s *UserService) updateUserPolicy(ctx context.Context, args proto.UserPermUpdateParam) (*proto.UserInfo, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if args.Volume == "" {
return nil, fmt.Errorf("user:[%s] need set userID", uid)
}
if v, e := s.cluster.getVol(args.Volume); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%s] onwer", uid, args.UserID)
}
}
}
if _, err := s.cluster.getVol(args.Volume); err != nil {
return nil, err
}
userInfo, err := s.user.updatePolicy(&args)
if err != nil {
return nil, err
}
return userInfo, nil
}
func (s *UserService) removeUserPolicy(ctx context.Context, args proto.UserPermRemoveParam) (*proto.UserInfo, error) {
if _, err := s.cluster.getVol(args.Volume); err != nil {
return nil, err
}
userInfo, err := s.user.removePolicy(&args)
if err != nil {
return nil, err
}
return userInfo, nil
}
func (s *UserService) createUser(ctx context.Context, args proto.UserCreateParam) (*proto.UserInfo, error) {
uid, _, err := permissions(ctx, ADMIN)
if err != nil {
return nil, err
}
if !ownerRegexp.MatchString(args.ID) {
return nil, fmt.Errorf("user id:[%s] is invalid", args.ID)
}
if args.Type == proto.UserTypeRoot {
return nil, fmt.Errorf("user type:[%s] can not to root", args.Type)
}
log.LogInfof("create user:[%s] by admin:[%s]", args.ID, uid)
return s.user.createKey(&args)
}
func (s *UserService) updateUser(ctx context.Context, args proto.UserUpdateParam) (*proto.UserInfo, error) {
uid, _, err := permissions(ctx, ADMIN)
if err != nil {
return nil, err
}
old, err := s.user.getUserInfo(args.UserID)
if err != nil {
return nil, err
}
if old.UserType != args.Type && args.Type == proto.UserTypeRoot {
return nil, fmt.Errorf("user type:[%s] can not to root", args.Type)
}
log.LogInfof("update user:[%s] by admin:[%s]", args.UserID, uid)
return s.user.updateKey(&args)
}
func (s *UserService) deleteUser(ctx context.Context, args struct {
UserID string
}) (*proto.GeneralResp, error) {
uid, _, err := permissions(ctx, ADMIN)
if err != nil {
return nil, err
}
// TODO : make sure can delete self? can delete other admin ??
log.LogInfof("delete user:[%s] by admin:[%s]", args.UserID, uid)
if err := s.user.deleteKey(args.UserID); err != nil {
return nil, err
}
return proto.Success("del user ok"), nil
}
func (s *UserService) getUserInfo(ctx context.Context, args struct {
UserID string
}) (*proto.UserInfo, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if uid != args.UserID {
return nil, fmt.Errorf("you:[%s] not have permission visit this userID:[%s]", uid, args.UserID)
}
}
return s.user.getUserInfo(args.UserID)
}
func (s *UserService) listUserInfo(ctx context.Context, args struct{}) ([]*proto.UserInfo, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
var list []*proto.UserInfo
s.user.userStore.Range(func(_, ui interface{}) bool {
list = append(list, ui.(*proto.UserInfo))
return true
})
return list, nil
}
type UserUseSpace struct {
Name string
Size uint64
Ratio float32
}
func (s *UserService) topNUser(ctx context.Context, args struct {
N int32
}) ([]*UserUseSpace, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
list := make([]*UserUseSpace, 0)
var err error
s.user.userStore.Range(func(_, ui interface{}) bool {
u := ui.(*proto.UserInfo)
us := &UserUseSpace{
Name: u.UserID,
Size: 0,
Ratio: 0,
}
for _, volName := range u.Policy.OwnVols {
v, e := s.cluster.getVol(volName)
if e != nil {
err = e
return false
}
us.Size += v.totalUsedSpace()
}
list = append(list, us)
return true
})
if err != nil {
return nil, err
}
sort.Slice(list, func(i int, j int) bool {
return list[i].Size > list[j].Size
})
if len(list) > 10 {
list = list[:10]
}
var sum uint64
for _, u := range list {
sum += u.Size
}
for _, u := range list {
if sum == 0 {
u.Ratio = float32(1) / float32(len(list))
} else {
u.Ratio = float32(u.Size) / float32(sum)
}
}
return list, nil
}
func (s *UserService) validatePassword(ctx context.Context, args struct {
UserID string
Password string
}) (*proto.UserInfo, error) {
ui, err := s.user.getUserInfo(args.UserID)
if err != nil {
return nil, err
}
ak, err := s.user.getAKUser(ui.AccessKey)
if err != nil {
return nil, err
}
hashedPassword := sha256.Sum256([]byte(args.Password))
hashedPasswordStr := hex.EncodeToString(hashedPassword[:])
hashedPassword_ := sha256.Sum256([]byte(ak.Password))
hashedPasswordStr_ := hex.EncodeToString(hashedPassword_[:])
if hashedPasswordStr != hashedPasswordStr_ {
log.LogWarnf("user:[%s] login pass word has err", args.UserID)
return nil, fmt.Errorf("user or password has err")
}
return ui, nil
}
type permissionMode int
const (
ADMIN permissionMode = permissionMode(1)
USER permissionMode = permissionMode(2)
)
func permissions(ctx context.Context, mode permissionMode) (userID string, perm permissionMode, err error) {
userInfo := ctx.Value(proto.UserInfoKey).(*proto.UserInfo)
userID = userInfo.UserID
perm = USER
if userInfo.UserType == proto.UserTypeRoot || userInfo.UserType == proto.UserTypeAdmin {
perm = ADMIN
}
if ADMIN&mode == ADMIN {
if perm == ADMIN {
return
}
}
if USER&mode == USER {
if perm == USER {
return
}
}
err = fmt.Errorf("user:[%s] permissions has err:[%d] your:[%d]", userInfo.UserID, mode, perm)
return
}
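// Hedged sketch (illustrative only): how the ADMIN/USER bitmask above resolves for an
// admin caller. registerHandler stores the authenticated proto.UserInfo under
// proto.UserInfoKey; a placeholder root user stands in for it here.
func examplePermissionsCheck() {
admin := &proto.UserInfo{UserID: "root", UserType: proto.UserTypeRoot}
ctx := context.WithValue(context.Background(), proto.UserInfoKey, admin)
// ADMIN|USER accepts either role; a root or admin caller resolves to perm == ADMIN.
if uid, perm, err := permissions(ctx, ADMIN|USER); err == nil {
fmt.Printf("uid=%s perm=%d\n", uid, perm)
}
}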
package master
import (
"context"
"fmt"
"math"
"sort"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/samsarahq/thunder/graphql"
"github.com/samsarahq/thunder/graphql/schemabuilder"
)
type VolumeService struct {
user *User
cluster *Cluster
}
func (s *VolumeService) Schema() *graphql.Schema {
schema := schemabuilder.NewSchema()
s.registerObject(schema)
s.registerQuery(schema)
s.registerMutation(schema)
return schema.MustBuild()
}
func (s *VolumeService) registerObject(schema *schemabuilder.Schema) {
object := schema.Object("Vol", Vol{})
object.FieldFunc("dpReplicaNum", func(ctx context.Context, v *Vol) (int32, error) {
if _, _, err := permissions(ctx, USER|ADMIN); err != nil {
return 0, err
}
return int32(v.dpReplicaNum), nil
})
object.FieldFunc("occupied", func(ctx context.Context, v *Vol) (int64, error) {
if _, _, err := permissions(ctx, USER|ADMIN); err != nil {
return 0, err
}
var used int64
for _, p := range v.cloneDataPartitionMap() {
used += int64(p.used)
}
return used, nil
})
object.FieldFunc("toSimpleVolView", func(ctx context.Context, vol *Vol) (*proto.SimpleVolView, error) {
if _, _, err := permissions(ctx, USER|ADMIN); err != nil {
return nil, err
}
return &proto.SimpleVolView{
ID: vol.ID,
Name: vol.Name,
Owner: vol.Owner,
ZoneName: vol.zoneName,
DpReplicaNum: vol.dpReplicaNum,
MpReplicaNum: vol.mpReplicaNum,
Status: vol.Status,
Capacity: vol.Capacity,
FollowerRead: vol.FollowerRead,
NeedToLowerReplica: vol.NeedToLowerReplica,
Authenticate: vol.authenticate,
CrossZone: vol.crossZone,
RwDpCnt: vol.dataPartitions.readableAndWritableCnt,
MpCnt: len(vol.MetaPartitions),
DpCnt: len(vol.dataPartitions.partitionMap),
CreateTime: time.Unix(vol.createTime, 0).Format(proto.TimeFormat),
Description: vol.description,
}, nil
})
object.FieldFunc("createTime", func(ctx context.Context, v *Vol) (int64, error) {
if _, _, err := permissions(ctx, USER|ADMIN); err != nil {
return 0, err
}
return v.createTime, nil
})
}
func (s *VolumeService) registerQuery(schema *schemabuilder.Schema) {
query := schema.Query()
query.FieldFunc("getVolume", s.getVolume)
query.FieldFunc("listVolume", s.listVolume)
query.FieldFunc("volPermission", s.volPermission)
}
func (s *VolumeService) registerMutation(schema *schemabuilder.Schema) {
mutation := schema.Mutation()
mutation.FieldFunc("createVolume", s.createVolume)
// mutation.FieldFunc("deleteVolume", s.markDeleteVol)
mutation.FieldFunc("updateVolume", s.updateVolume)
}
type UserPermission struct {
UserID string
Access []string
Edit bool
}
func (s *VolumeService) volPermission(ctx context.Context, args struct {
VolName string
UserID *string
},
) ([]*UserPermission, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if args.UserID == nil {
return nil, fmt.Errorf("user:[%s] need set userID", uid)
}
if v, e := s.cluster.getVol(args.VolName); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%d] onwer", uid, args.UserID)
}
}
}
vol, err := s.cluster.getVol(args.VolName)
if err != nil {
return nil, err
}
var volUser *proto.VolUser
if value, exist := s.user.volUser.Load(args.VolName); exist {
volUser = value.(*proto.VolUser)
} else {
return nil, fmt.Errorf("not found vol user in cluster")
}
userPList := make([]*UserPermission, 0, len(volUser.UserIDs))
userMap := make(map[string]bool)
for _, u := range volUser.UserIDs {
v, e := s.user.getUserInfo(u)
if e != nil {
log.LogWarnf("get user info by vol has err:[%s]", e.Error())
continue
}
if arr, exist := v.Policy.AuthorizedVols[args.VolName]; exist {
if userMap[u] {
continue
}
userMap[u] = true
userPList = append(userPList, &UserPermission{
UserID: u,
Access: arr,
Edit: uid == vol.Owner,
})
} else {
log.LogWarnf("get vol:[%s] author:[%s] by user policy has err ", args.VolName, u)
}
}
sort.Slice(userPList, func(i, j int) bool {
return userPList[i].Edit && !userPList[j].Edit
})
return userPList, nil
}
func (s *VolumeService) createVolume(ctx context.Context, args struct {
Name, Owner, ZoneName, Description string
Capacity, DataPartitionSize, MpCount, DpCount, DpReplicaNum uint64
FollowerRead, Authenticate, CrossZone, DefaultPriority bool
iopsRLimit, iopsWLimit, flowRlimit, flowWlimit uint64
},
) (*Vol, error) {
uid, per, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if !(args.DpReplicaNum == 2 || args.DpReplicaNum == 3) {
return nil, fmt.Errorf("replicaNum can only be 2 and 3,received replicaNum is[%v]", args.DpReplicaNum)
}
if per == USER && args.Owner != uid {
return nil, fmt.Errorf("[%s] not has permission to create volume for [%s]", uid, args.Owner)
}
if args.DpReplicaNum > math.MaxUint8 {
return nil, fmt.Errorf("invalid arg dpReplicaNum: %v", args.DpReplicaNum)
}
if args.DpCount > maxInitDataPartitionCnt {
return nil, fmt.Errorf("invalid arg dpCount[%v], exceeds maximum limit[%d]", args.DpCount, maxInitDataPartitionCnt)
}
req := &createVolReq{
name: args.Name,
owner: args.Owner,
dpSize: int(args.DataPartitionSize),
mpCount: int(args.MpCount),
dpCount: int(args.DpCount),
dpReplicaNum: uint8(args.DpReplicaNum),
capacity: int(args.Capacity),
followerRead: args.FollowerRead,
authenticate: args.Authenticate,
crossZone: args.CrossZone,
normalZonesFirst: args.DefaultPriority,
zoneName: args.ZoneName,
description: args.Description,
}
vol, err := s.cluster.createVol(req)
if err != nil {
return nil, err
}
userInfo, err := s.user.getUserInfo(args.Owner)
if err != nil {
if err != proto.ErrUserNotExists {
return nil, err
}
param := proto.UserCreateParam{
ID: args.Owner,
Password: DefaultUserPassword,
Type: proto.UserTypeNormal,
}
if userInfo, err = s.user.createKey(&param); err != nil {
return nil, err
}
}
if _, err = s.user.addOwnVol(userInfo.UserID, args.Name); err != nil {
return nil, err
}
return vol, nil
}
func (s *VolumeService) markDeleteVol(ctx context.Context, args struct {
Name, AuthKey string
},
) (*proto.GeneralResp, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if v, e := s.cluster.getVol(args.Name); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%s] onwer", uid, args.Name)
}
}
}
if err = s.user.deleteVolPolicy(args.Name); err != nil {
return nil, err
}
if err = s.cluster.markDeleteVol(args.Name, args.AuthKey, false); err != nil {
return nil, err
}
log.LogWarnf("delete vol[%s] successfully,from[%s]", args.Name, uid)
return proto.Success("success"), nil
}
func (s *VolumeService) updateVolume(ctx context.Context, args struct {
Name, AuthKey string
ZoneName, Description *string
Capacity, ReplicaNum *uint64
FollowerRead, Authenticate *bool
},
) (*Vol, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if v, e := s.cluster.getVol(args.Name); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%s] onwer", uid, args.Name)
}
}
}
if args.ReplicaNum != nil && !(*args.ReplicaNum == 2 || *args.ReplicaNum == 3) {
return nil, fmt.Errorf("replicaNum can only be 2 and 3,received replicaNum is[%v]", args.ReplicaNum)
}
vol, err := s.cluster.getVol(args.Name)
if err != nil {
return nil, err
}
newArgs := getVolVarargs(vol)
if args.FollowerRead != nil {
newArgs.followerRead = *args.FollowerRead
}
if args.Authenticate != nil {
newArgs.authenticate = *args.Authenticate
}
if args.ZoneName != nil {
newArgs.zoneName = *args.ZoneName
}
if args.Capacity != nil {
newArgs.capacity = *args.Capacity
}
if args.Description != nil {
newArgs.description = *args.Description
}
if err = s.cluster.updateVol(args.Name, args.AuthKey, newArgs); err != nil {
return nil, err
}
log.LogInfof("update vol[%v] successfully\n", args.Name)
vol, err = s.cluster.getVol(args.Name)
if err != nil {
return nil, err
}
return vol, nil
}
func (s *VolumeService) listVolume(ctx context.Context, args struct {
UserID *string
Keyword *string
},
) ([]*Vol, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
args.UserID = &uid
}
var list []*Vol
for _, vol := range s.cluster.vols {
if args.UserID != nil && vol.Owner != *args.UserID {
continue
}
if args.Keyword != nil && *args.Keyword != "" && strings.Contains(vol.Name, *args.Keyword) {
continue
}
if vol.Status == proto.VolStatusMarkDelete {
continue
}
list = append(list, vol)
}
return list, nil
}
func (s *VolumeService) getVolume(ctx context.Context, args struct {
Name string
},
) (*Vol, error,
) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if v, e := s.cluster.getVol(args.Name); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%s] onwer", uid, args.Name)
}
}
}
vol, err := s.cluster.getVol(args.Name)
if err != nil {
return nil, err
}
return vol, nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"context"
"encoding/json"
"fmt"
"html"
"net/http"
"net/http/httputil"
"strings"
"time"
"github.com/gorilla/mux"
"github.com/samsarahq/thunder/graphql"
"github.com/samsarahq/thunder/graphql/introspection"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
func (m *Server) startHTTPService(modulename string, cfg *config.Config) {
router := mux.NewRouter().SkipClean(true)
m.registerAPIRoutes(router)
m.registerAPIMiddleware(router)
if m.cluster.authenticate {
m.registerAuthenticationMiddleware(router)
}
exporter.InitWithRouter(modulename, cfg, router, m.port)
addr := fmt.Sprintf(":%s", m.port)
if m.bindIp {
addr = fmt.Sprintf("%s:%s", m.ip, m.port)
}
server := &http.Server{
Addr: addr,
Handler: router,
ReadTimeout: 5 * time.Minute,
WriteTimeout: 5 * time.Minute,
}
serveAPI := func() {
if err := server.ListenAndServe(); err != nil {
log.LogErrorf("serveAPI: serve http server failed: err(%v)", err)
return
}
}
go serveAPI()
m.apiServer = server
return
}
func (m *Server) isClientPartitionsReq(r *http.Request) bool {
return r.URL.Path == proto.ClientDataPartitions
}
func (m *Server) isFollowerRead(r *http.Request) (followerRead bool) {
followerRead = false
if r.URL.Path == proto.ClientDataPartitions && !m.partition.IsRaftLeader() {
if volName, err := parseAndExtractName(r); err == nil {
log.LogInfof("action[interceptor] followerRead vol[%v]", volName)
if followerRead = m.cluster.followerReadManager.IsVolViewReady(volName); followerRead {
log.LogInfof("action[interceptor] vol [%v] followerRead [%v], GetName[%v] IsRaftLeader[%v]",
volName, followerRead, r.URL.Path, m.partition.IsRaftLeader())
return
}
}
} else if r.URL.Path == proto.AdminChangeMasterLeader ||
r.URL.Path == proto.AdminOpFollowerPartitionsRead ||
r.URL.Path == proto.AdminPutDataPartitions ||
r.URL.Path == "/metrics" {
followerRead = true
}
return
}
func (m *Server) registerAPIMiddleware(route *mux.Router) {
var interceptor mux.MiddlewareFunc = func(next http.Handler) http.Handler {
return http.HandlerFunc(
func(w http.ResponseWriter, r *http.Request) {
log.LogDebugf("action[interceptor] request, method[%v] path[%v] query[%v]", r.Method, r.URL.Path, r.URL.Query())
if m.partition.IsRaftLeader() {
if err := m.cluster.apiLimiter.Wait(r.URL.Path); err != nil {
log.LogWarnf("action[interceptor] too many requests, path[%v]", r.URL.Path)
errMsg := fmt.Sprintf("too many requests for api: %s", html.EscapeString(r.URL.Path))
http.Error(w, errMsg, http.StatusTooManyRequests)
return
}
} else {
if m.cluster.apiLimiter.IsFollowerLimiter(r.URL.Path) {
if err := m.cluster.apiLimiter.Wait(r.URL.Path); err != nil {
log.LogWarnf("action[interceptor] too many requests, path[%v]", r.URL.Path)
errMsg := fmt.Sprintf("too many requests for api: %s", html.EscapeString(r.URL.Path))
http.Error(w, errMsg, http.StatusTooManyRequests)
return
}
}
}
log.LogInfof("action[interceptor] request, remote[%v] method[%v] path[%v] query[%v]",
r.RemoteAddr, r.Method, r.URL.Path, r.URL.Query())
if mux.CurrentRoute(r).GetName() == proto.AdminGetIP {
next.ServeHTTP(w, r)
return
}
isFollowerRead := m.isFollowerRead(r)
if m.partition.IsRaftLeader() || isFollowerRead {
if m.metaReady || isFollowerRead {
log.LogDebugf("action[interceptor] request, method[%v] path[%v] query[%v]", r.Method, r.URL.Path, r.URL.Query())
next.ServeHTTP(w, r)
return
}
log.LogWarnf("action[interceptor] leader meta has not ready")
http.Error(w, m.leaderInfo.addr, http.StatusBadRequest)
return
} else if m.leaderInfo.addr != "" {
if m.isClientPartitionsReq(r) {
log.LogErrorf("action[interceptor] request, method[%v] path[%v] query[%v] status [%v]", r.Method, r.URL.Path, r.URL.Query(), isFollowerRead)
http.Error(w, m.leaderInfo.addr, http.StatusBadRequest)
return
}
m.proxy(w, r)
} else {
log.LogErrorf("action[interceptor] no leader,request[%v]", r.URL)
http.Error(w, "no leader", http.StatusBadRequest)
return
}
})
}
route.Use(interceptor)
}
// AuthenticationUri2MsgTypeMap defines the mapping from authentication URI to message type
var AuthenticationUri2MsgTypeMap = map[string]proto.MsgType{
// Master API cluster management
proto.AdminClusterFreeze: proto.MsgMasterClusterFreezeReq,
proto.AddRaftNode: proto.MsgMasterAddRaftNodeReq,
proto.RemoveRaftNode: proto.MsgMasterRemoveRaftNodeReq,
proto.AdminSetNodeInfo: proto.MsgMasterSetNodeInfoReq,
proto.AdminSetNodeRdOnly: proto.MsgMasterSetNodeRdOnlyReq,
// Master API volume management
proto.AdminCreateVol: proto.MsgMasterCreateVolReq,
proto.AdminDeleteVol: proto.MsgMasterDeleteVolReq,
proto.AdminUpdateVol: proto.MsgMasterUpdateVolReq,
proto.AdminVolShrink: proto.MsgMasterVolShrinkReq,
proto.AdminVolExpand: proto.MsgMasterVolExpandReq,
// Master API meta partition management
proto.AdminLoadMetaPartition: proto.MsgMasterLoadMetaPartitionReq,
proto.AdminDecommissionMetaPartition: proto.MsgMasterDecommissionMetaPartitionReq,
proto.AdminChangeMetaPartitionLeader: proto.MsgMasterChangeMetaPartitionLeaderReq,
proto.AdminCreateMetaPartition: proto.MsgMasterCreateMetaPartitionReq,
proto.AdminAddMetaReplica: proto.MsgMasterAddMetaReplicaReq,
proto.AdminDeleteMetaReplica: proto.MsgMasterDeleteMetaReplicaReq,
proto.QosUpdate: proto.MsgMasterQosUpdateReq,
proto.QosUpdateZoneLimit: proto.MsgMasterQosUpdateZoneLimitReq,
proto.QosUpdateMasterLimit: proto.MsgMasterQosUpdateMasterLimitReq,
proto.QosUpdateClientParam: proto.MsgMasterQosUpdateClientParamReq,
// Master API data partition management
proto.AdminCreateDataPartition: proto.MsgMasterCreateDataPartitionReq,
proto.AdminDataPartitionChangeLeader: proto.MsgMasterDataPartitionChangeLeaderReq,
proto.AdminLoadDataPartition: proto.MsgMasterLoadDataPartitionReq,
proto.AdminDecommissionDataPartition: proto.MsgMasterDecommissionDataPartitionReq,
proto.AdminAddDataReplica: proto.MsgMasterAddDataReplicaReq,
proto.AdminDeleteDataReplica: proto.MsgMasterDeleteDataReplicaReq,
proto.AdminSetDpRdOnly: proto.MsgMasterSetDpRdOnlyReq,
// Master API meta node management
proto.AddMetaNode: proto.MsgMasterAddMetaNodeReq,
proto.DecommissionMetaNode: proto.MsgMasterDecommissionMetaNodeReq,
proto.MigrateMetaNode: proto.MsgMasterMigrateMetaNodeReq,
proto.AdminSetMetaNodeThreshold: proto.MsgMasterSetMetaNodeThresholdReq,
proto.AdminUpdateMetaNode: proto.MsgMasterUpdateMetaNodeReq,
// Master API data node management
proto.AddDataNode: proto.MsgMasterAddDataNodeReq,
proto.DecommissionDataNode: proto.MsgMasterDecommissionDataNodeReq,
proto.MigrateDataNode: proto.MsgMasterMigrateDataNodeReq,
proto.CancelDecommissionDataNode: proto.MsgMasterCancelDecommissionDataNodeReq,
proto.DecommissionDisk: proto.MsgMasterDecommissionDiskReq,
proto.AdminUpdateNodeSetCapcity: proto.MsgMasterUpdateNodeSetCapcityReq,
proto.AdminUpdateNodeSetId: proto.MsgMasterUpdateNodeSetIdReq,
proto.AdminUpdateDomainDataUseRatio: proto.MsgMasterUpdateDomainDataUseRatioReq,
proto.AdminUpdateZoneExcludeRatio: proto.MsgMasterUpdateZoneExcludeRatioReq,
proto.RecommissionDisk: proto.MsgMasterRecommissionDiskReq,
// Master API user management
proto.UserCreate: proto.MsgMasterUserCreateReq,
proto.UserDelete: proto.MsgMasterUserDeleteReq,
proto.UserUpdate: proto.MsgMasterUserUpdateReq,
proto.UserUpdatePolicy: proto.MsgMasterUserUpdatePolicyReq,
proto.UserRemovePolicy: proto.MsgMasterUserRemovePolicyReq,
proto.UserDeleteVolPolicy: proto.MsgMasterUserDeleteVolPolicyReq,
proto.UserTransferVol: proto.MsgMasterUserTransferVolReq,
// Master API zone management
proto.UpdateZone: proto.MsgMasterUpdateZoneReq,
}
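// Hedged sketch: the URI-to-message-type resolution performed by the authentication
// middleware below, isolated for clarity. Paths absent from the map skip client ID-key
// verification; the helper name is hypothetical.
func exampleResolveAuthMsgType(requestURI string) (proto.MsgType, bool) {
uriPath := strings.Split(requestURI, "?")[0]
msgType, ok := AuthenticationUri2MsgTypeMap[uriPath]
return msgType, ok
}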
func (m *Server) registerAuthenticationMiddleware(router *mux.Router) {
authenticationInterceptor := func(next http.Handler) http.Handler {
return http.HandlerFunc(
func(w http.ResponseWriter, r *http.Request) {
split := strings.Split(r.RequestURI, "?")
uriPath := split[0]
msgType, match := AuthenticationUri2MsgTypeMap[uriPath]
if match {
if err := m.cluster.parseAndCheckClientIDKey(r, msgType); err != nil {
log.LogInfof("action[AuthenticationInterceptor] parseAndCheckClientKey failed, RequestURI[%v], err[%v]",
r.RequestURI, err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInvalidClientIDKey, Msg: err.Error()})
return
}
}
next.ServeHTTP(w, r)
})
}
router.Use(authenticationInterceptor)
}
func (m *Server) registerAPIRoutes(router *mux.Router) {
// graphql api for cluster
cs := &ClusterService{user: m.user, cluster: m.cluster, conf: m.config, leaderInfo: m.leaderInfo}
m.registerHandler(router, proto.AdminClusterAPI, cs.Schema())
us := &UserService{user: m.user, cluster: m.cluster}
m.registerHandler(router, proto.AdminUserAPI, us.Schema())
// vs := &VolumeService{user: m.user, cluster: m.cluster}
// m.registerHandler(router, proto.AdminVolumeAPI, vs.Schema())
// cluster management APIs
router.NewRoute().Name(proto.AdminGetMasterApiList).
Methods(http.MethodGet).
Path(proto.AdminGetMasterApiList).
HandlerFunc(m.getApiList)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetApiQpsLimit).
HandlerFunc(m.setApiQpsLimit)
router.NewRoute().Name(proto.AdminGetApiQpsLimit).
Methods(http.MethodGet).
Path(proto.AdminGetApiQpsLimit).
HandlerFunc(m.getApiQpsLimit)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminRemoveApiQpsLimit).
HandlerFunc(m.rmApiQpsLimit)
router.NewRoute().Name(proto.AdminGetIP).
Methods(http.MethodGet).
Path(proto.AdminGetIP).
HandlerFunc(m.getIPAddr)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetCluster).
HandlerFunc(m.getCluster)
router.NewRoute().Name(proto.AdminACL).
Methods(http.MethodGet).
Path(proto.AdminACL).
HandlerFunc(m.aclOperate)
router.NewRoute().Name(proto.AdminUid).
Methods(http.MethodGet).
Path(proto.AdminUid).
HandlerFunc(m.UidOperate)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetClusterInfo).
HandlerFunc(m.setClusterInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetMonitorPushAddr).
HandlerFunc(m.getMonitorPushAddr)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminClusterFreeze).
HandlerFunc(m.setupAutoAllocation)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminVolForbidden).
HandlerFunc(m.forbidVolume)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminVolEnableAuditLog).
HandlerFunc(m.setEnableAuditLogForVolume)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminClusterForbidMpDecommission).
HandlerFunc(m.setupForbidMetaPartitionDecommission)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AddRaftNode).
HandlerFunc(m.addRaftNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.RemoveRaftNode).
HandlerFunc(m.removeRaftNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.RaftStatus).
HandlerFunc(m.getRaftStatus)
router.NewRoute().Methods(http.MethodGet).Path(proto.AdminClusterStat).HandlerFunc(m.clusterStat)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetCheckDataReplicasEnable).
HandlerFunc(m.setCheckDataReplicasEnable)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetConfig).
HandlerFunc(m.setConfigHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetConfig).
HandlerFunc(m.getConfigHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateDecommissionLimit).
HandlerFunc(m.updateDecommissionLimit)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminQueryDecommissionLimit).
HandlerFunc(m.queryDecommissionLimit)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminQueryDecommissionToken).
HandlerFunc(m.queryDecommissionToken)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetFileStats).
HandlerFunc(m.setFileStats)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetFileStats).
HandlerFunc(m.getFileStats)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetClusterUuidEnable).
HandlerFunc(m.setClusterUuidEnable)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetClusterUuid).
HandlerFunc(m.getClusterUuid)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGenerateClusterUuid).
HandlerFunc(m.generateClusterUuid)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetClusterValue).
HandlerFunc(m.GetClusterValue)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateDecommissionDiskFactor).
HandlerFunc(m.updateDecommissionDiskFactor)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminQueryDecommissionDiskLimit).
HandlerFunc(m.queryDecommissionDiskLimit)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminEnableAutoDecommissionDisk).
HandlerFunc(m.enableAutoDecommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminQueryAutoDecommissionDisk).
HandlerFunc(m.queryAutoDecommissionDisk)
// volume management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminCreateVol).
HandlerFunc(m.createVol)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetVol).
HandlerFunc(m.getVolSimpleInfo)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDeleteVol).
HandlerFunc(m.markDeleteVol)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateVol).
HandlerFunc(m.updateVol)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminVolShrink).
HandlerFunc(m.volShrink)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminVolExpand).
HandlerFunc(m.volExpand)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.ClientVol).
HandlerFunc(m.getVol)
router.NewRoute().Methods(http.MethodGet).
Path(proto.ClientVolStat).
HandlerFunc(m.getVolStatInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetTopologyView).
HandlerFunc(m.getTopology)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminListVols).
HandlerFunc(m.listVols)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminChangeMasterLeader).
HandlerFunc(m.changeMasterLeader)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminOpFollowerPartitionsRead).
HandlerFunc(m.OpFollowerPartitionsRead)
// multi version snapshot APIs
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminCreateVersion).
HandlerFunc(m.CreateVersion)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminDelVersion).
HandlerFunc(m.DelVersion)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetVersionInfo).
HandlerFunc(m.GetVersionInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetAllVersionInfo).
HandlerFunc(m.GetAllVersionInfo)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetVolVer).
HandlerFunc(m.getVolVer)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetVerStrategy).
HandlerFunc(m.SetVerStrategy)
// S3 lifecycle configuration APIS
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.SetBucketLifecycle).
HandlerFunc(m.SetBucketLifecycle)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.GetBucketLifecycle).
HandlerFunc(m.GetBucketLifecycle)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.DeleteBucketLifecycle).
HandlerFunc(m.DelBucketLifecycle)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AddLcNode).
HandlerFunc(m.addLcNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminLcNode).
HandlerFunc(m.lcnodeInfo)
// node task response APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.GetDataNodeTaskResponse).
HandlerFunc(m.handleDataNodeTaskResponse)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.GetMetaNodeTaskResponse).
HandlerFunc(m.handleMetaNodeTaskResponse)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.GetLcNodeTaskResponse).
HandlerFunc(m.handleLcNodeTaskResponse)
// meta partition management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminLoadMetaPartition).
HandlerFunc(m.loadMetaPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDecommissionMetaPartition).
HandlerFunc(m.decommissionMetaPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminChangeMetaPartitionLeader).
HandlerFunc(m.changeMetaPartitionLeader)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminBalanceMetaPartitionLeader).
HandlerFunc(m.balanceMetaPartitionLeader)
router.NewRoute().Methods(http.MethodGet).
Path(proto.ClientMetaPartitions).
HandlerFunc(m.getMetaPartitions)
router.NewRoute().Methods(http.MethodGet).
Path(proto.ClientMetaPartition).
HandlerFunc(m.getMetaPartition)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpload).
HandlerFunc(m.qosUpload)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosGetStatus).
HandlerFunc(m.getQosStatus)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosGetClientsLimitInfo).
HandlerFunc(m.getClientQosInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpdate).
HandlerFunc(m.QosUpdate)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpdateZoneLimit).
HandlerFunc(m.QosUpdateZoneLimit)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosGetZoneLimitInfo).
HandlerFunc(m.QosGetZoneLimit)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpdateMasterLimit).
HandlerFunc(m.getQosUpdateMasterLimit)
// router.NewRoute().Methods(http.MethodGet).
// Path(proto.QosUpdateMagnify).
// HandlerFunc(m.QosUpdateMagnify)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpdateClientParam).
HandlerFunc(m.QosUpdateClientParam)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminCreateMetaPartition).
HandlerFunc(m.createMetaPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminAddMetaReplica).
HandlerFunc(m.addMetaReplica)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDeleteMetaReplica).
HandlerFunc(m.deleteMetaReplica)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDiagnoseMetaPartition).
HandlerFunc(m.diagnoseMetaPartition)
// data partition management APIs
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetDataPartition).
HandlerFunc(m.getDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminCreateDataPartition).
HandlerFunc(m.createDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminCreatePreLoadDataPartition).
HandlerFunc(m.createPreLoadDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDataPartitionChangeLeader).
HandlerFunc(m.changeDataPartitionLeader)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminLoadDataPartition).
HandlerFunc(m.loadDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDecommissionDataPartition).
HandlerFunc(m.decommissionDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDiagnoseDataPartition).
HandlerFunc(m.diagnoseDataPartition)
router.NewRoute().Methods(http.MethodGet).
Path(proto.ClientDataPartitions).
HandlerFunc(m.getDataPartitions)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminResetDataPartitionDecommissionStatus).
HandlerFunc(m.resetDataPartitionDecommissionStatus)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminQueryDataPartitionDecommissionStatus).
HandlerFunc(m.queryDataPartitionDecommissionStatus)
// meta node management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AddMetaNode).
HandlerFunc(m.addMetaNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.DecommissionMetaNode).
HandlerFunc(m.decommissionMetaNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.MigrateMetaNode).
HandlerFunc(m.migrateMetaNodeHandler)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetMetaNode).
HandlerFunc(m.getMetaNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetMetaNodeThreshold).
HandlerFunc(m.setMetaNodeThreshold)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminAddDataReplica).
HandlerFunc(m.addDataReplica)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDeleteDataReplica).
HandlerFunc(m.deleteDataReplica)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateMetaNode).
HandlerFunc(m.updateMetaNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateDataNode).
HandlerFunc(m.updateDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetInvalidNodes).
HandlerFunc(m.checkInvalidIDNodes)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminPutDataPartitions).
HandlerFunc(m.putDataPartitions)
// data node management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AddDataNode).
HandlerFunc(m.addDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.DecommissionDataNode).
HandlerFunc(m.decommissionDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDataNodeDecoProgress).
HandlerFunc(m.queryDataNodeDecoProgress)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.MigrateDataNode).
HandlerFunc(m.migrateDataNodeHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.CancelDecommissionDataNode).
HandlerFunc(m.cancelDecommissionDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDataNodeDecoFailedDps).
HandlerFunc(m.queryDataNodeDecoFailedDps)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetDataNode).
HandlerFunc(m.getDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.DecommissionDisk).
HandlerFunc(m.decommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.RecommissionDisk).
HandlerFunc(m.recommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.RestoreStoppedAutoDecommissionDisk).
HandlerFunc(m.restoreStoppedAutoDecommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDiskDecoProgress).
HandlerFunc(m.queryDiskDecoProgress)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.MarkDecoDiskFixed).
HandlerFunc(m.markDecoDiskFixed)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.CancelDecommissionDisk).
HandlerFunc(m.cancelDecommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDecommissionDiskDecoFailedDps).
HandlerFunc(m.queryDecommissionDiskDecoFailedDps)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryBadDisks).
HandlerFunc(m.queryBadDisks)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryAllDecommissionDisk).
HandlerFunc(m.queryAllDecommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDisableDisk).
HandlerFunc(m.queryDisableDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetNodeInfo).
HandlerFunc(m.setNodeInfoHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetNodeInfo).
HandlerFunc(m.getNodeInfoHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetIsDomainOn).
HandlerFunc(m.getIsDomainOn)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetAllNodeSetGrpInfo).
HandlerFunc(m.getAllNodeSetGrpInfoHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetNodeSetGrpInfo).
HandlerFunc(m.getNodeSetGrpInfoHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateNodeSetCapcity).
HandlerFunc(m.updateNodeSetCapacityHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateNodeSetId).
HandlerFunc(m.updateNodeSetIdHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateNodeSetNodeSelector).
HandlerFunc(m.updateNodeSetNodeSelector)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateDomainDataUseRatio).
HandlerFunc(m.updateDataUseRatioHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateZoneExcludeRatio).
HandlerFunc(m.updateZoneExcludeRatioHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetNodeRdOnly).
HandlerFunc(m.setNodeRdOnlyHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetDpRdOnly).
HandlerFunc(m.setDpRdOnlyHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetDpDiscard).
HandlerFunc(m.setDpDiscardHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetDiscardDp).
HandlerFunc(m.getDiscardDpHandler)
// user management APIs
router.NewRoute().Methods(http.MethodPost).
Path(proto.UserCreate).
HandlerFunc(m.createUser)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserDelete).
HandlerFunc(m.deleteUser)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserUpdate).
HandlerFunc(m.updateUser)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserUpdatePolicy).
HandlerFunc(m.updateUserPolicy)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserRemovePolicy).
HandlerFunc(m.removeUserPolicy)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserDeleteVolPolicy).
HandlerFunc(m.deleteUserVolPolicy)
router.NewRoute().Methods(http.MethodGet).
Path(proto.UserGetAKInfo).
HandlerFunc(m.getUserAKInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.UserGetInfo).
HandlerFunc(m.getUserInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.UserList).
HandlerFunc(m.getAllUsers)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserTransferVol).
HandlerFunc(m.transferUserVol)
router.NewRoute().Methods(http.MethodGet).
Path(proto.UsersOfVol).
HandlerFunc(m.getUsersOfVol)
// zone management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UpdateZone).
HandlerFunc(m.updateZone)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetAllZones).
HandlerFunc(m.listZone)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetAllNodeSets).
HandlerFunc(m.listNodeSets)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetNodeSet).
HandlerFunc(m.getNodeSet)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UpdateNodeSet).
HandlerFunc(m.updateNodeSet)
// Quota
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QuotaCreate).
HandlerFunc(m.CreateQuota)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QuotaUpdate).
HandlerFunc(m.UpdateQuota)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QuotaDelete).
HandlerFunc(m.DeleteQuota)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QuotaList).
HandlerFunc(m.ListQuota)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QuotaGet).
HandlerFunc(m.GetQuota)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QuotaListAll).
HandlerFunc(m.ListQuotaAll)
// S3 API QoS Manager
router.NewRoute().Methods(http.MethodPut, http.MethodPost).
Path(proto.S3QoSSet).
HandlerFunc(m.S3QosSet)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.S3QoSGet).
HandlerFunc(m.S3QosGet)
router.NewRoute().Methods(http.MethodDelete, http.MethodPost).
Path(proto.S3QoSDelete).
HandlerFunc(m.S3QosDelete)
}
func (m *Server) registerHandler(router *mux.Router, model string, schema *graphql.Schema) {
introspection.AddIntrospectionToSchema(schema)
gHandler := graphql.HTTPHandler(schema)
router.NewRoute().Name(model).Methods(http.MethodGet, http.MethodPost).Path(model).HandlerFunc(func(writer http.ResponseWriter, request *http.Request) {
userID := request.Header.Get(proto.UserKey)
if userID == "" {
ErrResponse(writer, fmt.Errorf("not found [%s] in header", proto.UserKey))
return
}
if ui, err := m.user.getUserInfo(userID); err != nil {
ErrResponse(writer, fmt.Errorf("user:[%s] not found ", userID))
return
} else {
request = request.WithContext(context.WithValue(request.Context(), proto.UserInfoKey, ui))
}
gHandler.ServeHTTP(writer, request)
})
}
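// ErrResponse writes err back to the client as a JSON body of the form
// {"errors":["<message>"]}, setting the Content-Type to application/json
// when the caller has not already set one.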
func ErrResponse(w http.ResponseWriter, err error) {
response := struct {
Errors []string `json:"errors"`
}{
Errors: []string{err.Error()},
}
responseJSON, err := json.Marshal(response)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if w.Header().Get("Content-Type") == "" {
w.Header().Set("Content-Type", "application/json")
}
if _, e := w.Write(responseJSON); e != nil {
log.LogErrorf("send response has err:[%s]", e)
}
}
func (m *Server) newReverseProxy() *httputil.ReverseProxy {
return &httputil.ReverseProxy{Director: func(request *http.Request) {
request.URL.Scheme = "http"
request.URL.Host = m.leaderInfo.addr
}}
}
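// proxy forwards the request to the current leader address via the reverse
// proxy built by newReverseProxy, so that requests landing on this node can
// be served by the leader.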
func (m *Server) proxy(w http.ResponseWriter, r *http.Request) {
m.reverseProxy.ServeHTTP(w, r)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strconv"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/raftstore/raftstore_db"
"github.com/cubefs/cubefs/util/log"
)
// IDAllocator generates and allocates ids
type IDAllocator struct {
dataPartitionID uint64
metaPartitionID uint64
commonID uint64
clientID uint64
clientIDLimit uint64
quotaID uint32
store *raftstore_db.RocksDBStore
partition raftstore.Partition
dpIDLock sync.RWMutex
mpIDLock sync.RWMutex
mnIDLock sync.RWMutex
qaIDLock sync.RWMutex
}
const clientIDBatchCount = 1000
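// newIDAllocator constructs an allocator backed by the raft-replicated
// RocksDB store. A minimal boot sequence (illustrative sketch; the store and
// partition values are assumed to be provided by the caller):
//
//	alloc := newIDAllocator(store, partition)
//	alloc.restore()                              // reload persisted watermarks
//	dpID, err := alloc.allocateDataPartitionID() // reserve the next data partition ID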
func newIDAllocator(store *raftstore_db.RocksDBStore, partition raftstore.Partition) (alloc *IDAllocator) {
alloc = new(IDAllocator)
alloc.store = store
alloc.partition = partition
return
}
func (alloc *IDAllocator) restore() {
alloc.restoreMaxDataPartitionID()
alloc.restoreMaxMetaPartitionID()
alloc.restoreMaxCommonID()
alloc.restoreMaxQuotaID()
alloc.restoreClientID()
}
func (alloc *IDAllocator) restoreMaxDataPartitionID() {
value, err := alloc.store.Get(maxDataPartitionIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxDataPartitionID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) == 0 {
alloc.dataPartitionID = 0
return
}
maxDataPartitionID, err := strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxDataPartitionID,err:%v ", err.Error()))
}
alloc.dataPartitionID = maxDataPartitionID
log.LogInfof("action[restoreMaxDataPartitionID] maxDpID[%v]", alloc.dataPartitionID)
}
func (alloc *IDAllocator) restoreMaxMetaPartitionID() {
value, err := alloc.store.Get(maxMetaPartitionIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxPartitionID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) == 0 {
alloc.metaPartitionID = 0
return
}
maxPartitionID, err := strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxPartitionID,err:%v ", err.Error()))
}
alloc.metaPartitionID = maxPartitionID
log.LogInfof("action[restoreMaxMetaPartitionID] maxMpID[%v]", alloc.metaPartitionID)
}
// The data node, meta node, and node set share the same ID allocator.
func (alloc *IDAllocator) restoreMaxCommonID() {
value, err := alloc.store.Get(maxCommonIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxCommonID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) == 0 {
alloc.commonID = 0
return
}
maxMetaNodeID, err := strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxCommonID,err:%v ", err.Error()))
}
alloc.commonID = maxMetaNodeID
log.LogInfof("action[restoreMaxCommonID] maxCommonID[%v]", alloc.commonID)
}
func (alloc *IDAllocator) restoreMaxQuotaID() {
value, err := alloc.store.Get(maxQuotaIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxQuotaID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) == 0 {
alloc.quotaID = 0
return
}
maxQuotaID, err := strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxQuotaID,err:%v ", err.Error()))
}
if maxQuotaID > 0 && maxQuotaID <= math.MaxInt32 {
alloc.quotaID = uint32(maxQuotaID)
} else {
alloc.quotaID = math.MaxInt32
}
log.LogInfof("action[restoreMaxCommonID] maxQuotaID[%v]", alloc.quotaID)
}
func (alloc *IDAllocator) setDataPartitionID(id uint64) {
atomic.StoreUint64(&alloc.dataPartitionID, id)
}
func (alloc *IDAllocator) setMetaPartitionID(id uint64) {
atomic.StoreUint64(&alloc.metaPartitionID, id)
}
func (alloc *IDAllocator) setCommonID(id uint64) {
atomic.StoreUint64(&alloc.commonID, id)
}
func (alloc *IDAllocator) restoreClientID() {
alloc.mpIDLock.Lock()
defer alloc.mpIDLock.Unlock()
value, err := alloc.store.Get(maxClientIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxClientID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) != 0 {
alloc.clientID, err = strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxClientID,err:%v ", err.Error()))
}
}
alloc.clientIDLimit = alloc.clientID
alloc.clientID += clientIDBatchCount
}
func (alloc *IDAllocator) setQuotaID(id uint32) {
atomic.StoreUint32(&alloc.quotaID, id)
}
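// allocateDataPartitionID reserves the next data partition ID. The new
// watermark is first submitted through the raft partition
// (opSyncAllocDataPartitionID) and only then applied to the in-memory
// counter, so a handed-out allocation is persisted before it is used.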
func (alloc *IDAllocator) allocateDataPartitionID() (partitionID uint64, err error) {
alloc.dpIDLock.Lock()
defer alloc.dpIDLock.Unlock()
var cmd []byte
metadata := new(RaftCmd)
partitionID = atomic.LoadUint64(&alloc.dataPartitionID) + 1
metadata.Op = opSyncAllocDataPartitionID
metadata.K = maxDataPartitionIDKey
value := strconv.FormatUint(uint64(partitionID), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.setDataPartitionID(partitionID)
return
errHandler:
log.LogErrorf("action[allocateDataPartitionID] err:%v", err.Error())
return
}
func (alloc *IDAllocator) allocateMetaPartitionID() (partitionID uint64, err error) {
alloc.mpIDLock.Lock()
defer alloc.mpIDLock.Unlock()
var cmd []byte
metadata := new(RaftCmd)
metadata.Op = opSyncAllocMetaPartitionID
metadata.K = maxMetaPartitionIDKey
partitionID = atomic.LoadUint64(&alloc.metaPartitionID) + 1
value := strconv.FormatUint(uint64(partitionID), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.setMetaPartitionID(partitionID)
return
errHandler:
log.LogErrorf("action[allocateMetaPartitionID] err:%v", err.Error())
return
}
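// allocateClientID hands out client IDs in batches of clientIDBatchCount:
// a raft submit is only issued when the in-memory counter crosses
// clientIDLimit, so with the batch size of 1000 at most one persistence
// round-trip is needed per 1000 allocations.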
func (alloc *IDAllocator) allocateClientID() (clientID uint64, err error) {
alloc.mpIDLock.Lock()
defer alloc.mpIDLock.Unlock()
clientID = alloc.clientID + 1
if alloc.clientIDLimit < clientID {
var cmd []byte
metadata := new(RaftCmd)
metadata.Op = opSyncAllocClientID
metadata.K = maxClientIDKey
// persist the current high-water mark (clientID - 1) before extending the in-memory limit
value := strconv.FormatUint(uint64(alloc.clientID), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.clientIDLimit = alloc.clientID + clientIDBatchCount
}
alloc.clientID = clientID
return
errHandler:
log.LogErrorf("action[allocateClientID] err:%v", err.Error())
return
}
func (alloc *IDAllocator) allocateCommonID() (id uint64, err error) {
alloc.mnIDLock.Lock()
defer alloc.mnIDLock.Unlock()
var cmd []byte
metadata := new(RaftCmd)
metadata.Op = opSyncAllocCommonID
metadata.K = maxCommonIDKey
id = atomic.LoadUint64(&alloc.commonID) + 1
value := strconv.FormatUint(uint64(id), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.setCommonID(id)
return
errHandler:
log.LogErrorf("action[allocateCommonID] err:%v", err.Error())
return
}
func (alloc *IDAllocator) allocateQuotaID() (id uint32, err error) {
alloc.qaIDLock.Lock()
defer alloc.qaIDLock.Unlock()
var cmd []byte
metadata := new(RaftCmd)
metadata.Op = opSyncAllocQuotaID
metadata.K = maxQuotaIDKey
id = atomic.LoadUint32(&alloc.quotaID) + 1
value := strconv.FormatUint(uint64(id), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.setQuotaID(id)
return
errHandler:
log.LogErrorf("action[allocateQuotaID] err:%v", err.Error())
return
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"math"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type lifecycleManager struct {
sync.RWMutex
cluster *Cluster
lcConfigurations map[string]*proto.LcConfiguration
lcNodeStatus *lcNodeStatus
lcRuleTaskStatus *lcRuleTaskStatus
idleLcNodeCh chan struct{}
exitCh chan struct{}
}
func newLifecycleManager() *lifecycleManager {
log.LogInfof("action[newLifecycleManager] construct")
lcMgr := &lifecycleManager{
lcConfigurations: make(map[string]*proto.LcConfiguration),
lcNodeStatus: newLcNodeStatus(),
lcRuleTaskStatus: newLcRuleTaskStatus(),
idleLcNodeCh: make(chan struct{}),
exitCh: make(chan struct{}),
}
return lcMgr
}
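// startLcScan snapshots the currently enabled lifecycle rules into
// ToBeScanned and launches process(), which hands one rule task to an idle
// lcnode every time a node reports spare capacity. If a scan is already in
// progress the call is ignored.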
func (lcMgr *lifecycleManager) startLcScan() {
// do not start a new scan if one is already in progress
if lcMgr.scanning() {
log.LogWarnf("startLcScan: scanning is not completed, lcRuleTaskStatus(%v)", lcMgr.lcRuleTaskStatus)
return
}
tasks := lcMgr.genEnabledRuleTasks()
if len(tasks) <= 0 {
log.LogDebugf("startLcScan: no enabled lifecycle rule task to schedule!")
return
}
log.LogDebugf("startLcScan: %v lifecycle rule tasks to schedule!", len(tasks))
// start scan init
lcMgr.lcRuleTaskStatus = newLcRuleTaskStatus()
for _, r := range tasks {
lcMgr.lcRuleTaskStatus.ToBeScanned[r.Id] = r
}
go lcMgr.process()
}
// generate tasks for every bucket
func (lcMgr *lifecycleManager) genEnabledRuleTasks() []*proto.RuleTask {
lcMgr.RLock()
defer lcMgr.RUnlock()
tasks := make([]*proto.RuleTask, 0)
for _, v := range lcMgr.lcConfigurations {
ts := v.GenEnabledRuleTasks()
if len(ts) > 0 {
tasks = append(tasks, ts...)
}
}
return tasks
}
func (lcMgr *lifecycleManager) scanning() bool {
log.LogInfof("decide scanning, lcNodeStatus: %v, lcRuleTaskStatus: %v", lcMgr.lcNodeStatus, lcMgr.lcRuleTaskStatus)
if len(lcMgr.lcRuleTaskStatus.ToBeScanned) > 0 {
return true
}
for _, v := range lcMgr.lcRuleTaskStatus.Results {
if !v.Done && time.Now().Before(v.UpdateTime.Add(time.Minute*10)) {
return true
}
}
for _, c := range lcMgr.lcNodeStatus.WorkingCount {
if c > 0 {
return true
}
}
log.LogInfof("decide scanning, scanning stop!")
return false
}
func (lcMgr *lifecycleManager) process() {
log.LogInfof("lifecycleManager process start, rule num(%v)", len(lcMgr.lcRuleTaskStatus.ToBeScanned))
now := time.Now()
lcMgr.lcRuleTaskStatus.StartTime = &now
for lcMgr.scanning() {
log.LogDebugf("wait idleLcNodeCh... ToBeScanned num(%v)", len(lcMgr.lcRuleTaskStatus.ToBeScanned))
select {
case <-lcMgr.exitCh:
log.LogInfo("exitCh notified, lifecycleManager process exit")
return
case <-lcMgr.idleLcNodeCh:
log.LogDebug("idleLcNodeCh notified")
// ToBeScanned -> Scanning
task := lcMgr.lcRuleTaskStatus.GetOneTask()
if task == nil {
log.LogDebugf("lcRuleTaskStatus.GetOneTask, no task")
continue
}
nodeAddr := lcMgr.lcNodeStatus.GetIdleNode()
if nodeAddr == "" {
log.LogWarn("no idle lcnode, redo task")
lcMgr.lcRuleTaskStatus.RedoTask(task)
continue
}
val, ok := lcMgr.cluster.lcNodes.Load(nodeAddr)
if !ok {
log.LogErrorf("lcNodes.Load, nodeAddr(%v) is not available, redo task", nodeAddr)
lcMgr.lcNodeStatus.RemoveNode(nodeAddr)
lcMgr.lcRuleTaskStatus.RedoTask(task)
continue
}
node := val.(*LcNode)
adminTask := node.createLcScanTask(lcMgr.cluster.masterAddr(), task)
lcMgr.cluster.addLcNodeTasks([]*proto.AdminTask{adminTask})
log.LogDebugf("add lifecycle scan task(%v) to lcnode(%v)", *task, nodeAddr)
}
}
end := time.Now()
lcMgr.lcRuleTaskStatus.EndTime = &end
log.LogInfof("lifecycleManager process finish, lcRuleTaskStatus results(%v)", lcMgr.lcRuleTaskStatus.Results)
}
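// notifyIdleLcNode signals the scan loop in process() that an lcnode has
// spare capacity. The non-blocking send (select with default) ensures that
// heartbeat handling never blocks when no scan loop is waiting.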
func (lcMgr *lifecycleManager) notifyIdleLcNode() {
select {
case lcMgr.idleLcNodeCh <- struct{}{}:
log.LogDebug("action[handleLcNodeHeartbeatResp], lifecycleManager scan routine notified!")
default:
log.LogDebug("action[handleLcNodeHeartbeatResp], lifecycleManager skipping notify!")
}
}
func (lcMgr *lifecycleManager) SetS3BucketLifecycle(lcConf *proto.LcConfiguration) error {
lcMgr.Lock()
defer lcMgr.Unlock()
lcMgr.lcConfigurations[lcConf.VolName] = lcConf
return nil
}
func (lcMgr *lifecycleManager) GetS3BucketLifecycle(VolName string) (lcConf *proto.LcConfiguration) {
lcMgr.RLock()
defer lcMgr.RUnlock()
var ok bool
lcConf, ok = lcMgr.lcConfigurations[VolName]
if !ok {
return nil
}
return lcConf
}
func (lcMgr *lifecycleManager) DelS3BucketLifecycle(VolName string) {
lcMgr.Lock()
defer lcMgr.Unlock()
delete(lcMgr.lcConfigurations, VolName)
}
//-----------------------------------------------
type OpLcNode interface {
GetIdleNode() (nodeAddr string)
RemoveNode(nodeAddr string)
UpdateNode(nodeAddr string, count int)
}
// update status by heartbeat
type lcNodeStatus struct {
sync.RWMutex
WorkingCount map[string]int // ip -> number of tasks currently being processed on that node
}
func newLcNodeStatus() *lcNodeStatus {
return &lcNodeStatus{
WorkingCount: make(map[string]int),
}
}
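// GetIdleNode returns the address of the lcnode with the fewest in-flight
// tasks and increments its working count. For example (addresses are
// illustrative), with WorkingCount {"10.0.0.1:17510": 2, "10.0.0.2:17510": 0}
// it returns "10.0.0.2:17510" and bumps its count to 1; an empty string is
// returned when no node has registered yet.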
func (ns *lcNodeStatus) GetIdleNode() (nodeAddr string) {
ns.Lock()
defer ns.Unlock()
if len(ns.WorkingCount) == 0 {
return
}
min := math.MaxInt
for n, c := range ns.WorkingCount {
if c < min {
nodeAddr = n
min = c
}
if c == 0 {
break
}
}
ns.WorkingCount[nodeAddr]++
return
}
func (ns *lcNodeStatus) RemoveNode(nodeAddr string) {
ns.Lock()
defer ns.Unlock()
delete(ns.WorkingCount, nodeAddr)
return
}
func (ns *lcNodeStatus) UpdateNode(nodeAddr string, count int) {
ns.Lock()
defer ns.Unlock()
ns.WorkingCount[nodeAddr] = count
return
}
// -----------------------------------------------
type lcRuleTaskStatus struct {
sync.RWMutex
ToBeScanned map[string]*proto.RuleTask
Results map[string]*proto.LcNodeRuleTaskResponse
StartTime *time.Time
EndTime *time.Time
}
func newLcRuleTaskStatus() *lcRuleTaskStatus {
return &lcRuleTaskStatus{
ToBeScanned: make(map[string]*proto.RuleTask),
Results: make(map[string]*proto.LcNodeRuleTaskResponse),
}
}
func (rs *lcRuleTaskStatus) GetOneTask() (task *proto.RuleTask) {
rs.Lock()
defer rs.Unlock()
if len(rs.ToBeScanned) == 0 {
return
}
for _, t := range rs.ToBeScanned {
task = t
break
}
delete(rs.ToBeScanned, task.Id)
return
}
func (rs *lcRuleTaskStatus) RedoTask(task *proto.RuleTask) {
rs.Lock()
defer rs.Unlock()
if task == nil {
return
}
rs.ToBeScanned[task.Id] = task
}
func (rs *lcRuleTaskStatus) AddResult(resp *proto.LcNodeRuleTaskResponse) {
rs.Lock()
defer rs.Unlock()
rs.Results[resp.ID] = resp
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type LcNode struct {
ID uint64
Addr string
ReportTime time.Time
IsActive bool
TaskManager *AdminTaskManager
sync.RWMutex
}
func newLcNode(addr, clusterID string) (lcNode *LcNode) {
lcNode = new(LcNode)
lcNode.Addr = addr
lcNode.IsActive = true
lcNode.ReportTime = time.Now()
lcNode.TaskManager = newAdminTaskManager(lcNode.Addr, clusterID)
return
}
func (lcNode *LcNode) clean() {
lcNode.TaskManager.exitCh <- struct{}{}
}
func (lcNode *LcNode) checkLiveness() {
lcNode.Lock()
defer lcNode.Unlock()
log.LogInfof("action[checkLiveness] lcnode[%v, %v, %v] report time[%v], since report time[%v], need gap[%v]",
lcNode.ID, lcNode.Addr, lcNode.IsActive, lcNode.ReportTime, time.Since(lcNode.ReportTime), time.Second*time.Duration(defaultNodeTimeOutSec))
if time.Since(lcNode.ReportTime) > time.Second*time.Duration(defaultNodeTimeOutSec) {
lcNode.IsActive = false
}
return
}
func (lcNode *LcNode) createHeartbeatTask(masterAddr string) (task *proto.AdminTask) {
request := &proto.HeartBeatRequest{
CurrTime: time.Now().Unix(),
MasterAddr: masterAddr,
}
task = proto.NewAdminTask(proto.OpLcNodeHeartbeat, lcNode.Addr, request)
return
}
func (lcNode *LcNode) createLcScanTask(masterAddr string, ruleTask *proto.RuleTask) (task *proto.AdminTask) {
request := &proto.LcNodeRuleTaskRequest{
MasterAddr: masterAddr,
LcNodeAddr: lcNode.Addr,
Task: ruleTask,
}
task = proto.NewAdminTaskEx(proto.OpLcNodeScan, lcNode.Addr, request, ruleTask.Id)
return
}
func (lcNode *LcNode) createSnapshotVerDelTask(masterAddr string, sTask *proto.SnapshotVerDelTask) (task *proto.AdminTask) {
request := &proto.SnapshotVerDelTaskRequest{
MasterAddr: masterAddr,
LcNodeAddr: lcNode.Addr,
Task: sTask,
}
task = proto.NewAdminTaskEx(proto.OpLcNodeSnapshotVerDel, lcNode.Addr, request, request.Task.Id)
return
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
func (c *Cluster) handleLcNodeTaskResponse(nodeAddr string, task *proto.AdminTask) {
if task == nil {
log.LogInfof("lc action[handleLcNodeTaskResponse] receive addr[%v] task response, but task is nil", nodeAddr)
return
}
log.LogInfof("lc action[handleLcNodeTaskResponse] receive addr[%v] task: %v", nodeAddr, task.ToString())
var (
err error
lcNode *LcNode
)
if lcNode, err = c.lcNode(nodeAddr); err != nil {
goto errHandler
}
lcNode.TaskManager.DelTask(task)
if err = unmarshalTaskResponse(task); err != nil {
goto errHandler
}
switch task.OpCode {
case proto.OpLcNodeHeartbeat:
response := task.Response.(*proto.LcNodeHeartbeatResponse)
err = c.handleLcNodeHeartbeatResp(task.OperatorAddr, response)
case proto.OpLcNodeScan:
response := task.Response.(*proto.LcNodeRuleTaskResponse)
err = c.handleLcNodeLcScanResp(task.OperatorAddr, response)
case proto.OpLcNodeSnapshotVerDel:
response := task.Response.(*proto.SnapshotVerDelTaskResponse)
err = c.handleLcNodeSnapshotScanResp(task.OperatorAddr, response)
default:
err = fmt.Errorf("lc unknown operate code %v", task.OpCode)
goto errHandler
}
if err != nil {
goto errHandler
}
return
errHandler:
log.LogWarnf("lc handleLcNodeTaskResponse failed, task: %v, err: %v", task.ToString(), err)
return
}
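// handleLcNodeHeartbeatResp refreshes the node's liveness, records its
// working-task counts for both the lifecycle and snapshot managers, merges
// any in-flight task results (without overwriting results already marked
// done), and notifies the schedulers when the node still has spare capacity.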
func (c *Cluster) handleLcNodeHeartbeatResp(nodeAddr string, resp *proto.LcNodeHeartbeatResponse) (err error) {
var lcNode *LcNode
log.LogDebugf("action[handleLcNodeHeartbeatResp] clusterID[%v] receive lcNode[%v] heartbeat", c.Name, nodeAddr)
if resp.Status != proto.TaskSucceeds {
Warn(c.Name, fmt.Sprintf("action[handleLcNodeHeartbeatResp] clusterID[%v] lcNode[%v] heartbeat task failed, err[%v]",
c.Name, nodeAddr, resp.Result))
return
}
if lcNode, err = c.lcNode(nodeAddr); err != nil {
log.LogErrorf("action[handleLcNodeHeartbeatResp], lcNode[%v], heartbeat error: %v", nodeAddr, err.Error())
return
}
lcNode.Lock()
lcNode.IsActive = true
lcNode.ReportTime = time.Now()
lcNode.Unlock()
// update lcNodeStatus
log.LogInfof("action[handleLcNodeHeartbeatResp], lcNode[%v], LcScanningTasks[%v], SnapshotScanningTasks[%v]", nodeAddr, len(resp.LcScanningTasks), len(resp.SnapshotScanningTasks))
c.lcMgr.lcNodeStatus.UpdateNode(nodeAddr, len(resp.LcScanningTasks))
c.snapshotMgr.lcNodeStatus.UpdateNode(nodeAddr, len(resp.SnapshotScanningTasks))
// handle LcScanningTasks
for _, taskRsp := range resp.LcScanningTasks {
c.lcMgr.lcRuleTaskStatus.Lock()
// avoid overwriting Results when handleLcNodeLcScanResp was received before this heartbeat
if c.lcMgr.lcRuleTaskStatus.Results[taskRsp.ID] != nil && c.lcMgr.lcRuleTaskStatus.Results[taskRsp.ID].Done {
log.LogInfof("action[handleLcNodeHeartbeatResp], lcNode[%v] task[%v] already done", nodeAddr, taskRsp.ID)
} else {
t := time.Now()
taskRsp.UpdateTime = &t
c.lcMgr.lcRuleTaskStatus.Results[taskRsp.ID] = taskRsp
}
c.lcMgr.lcRuleTaskStatus.Unlock()
log.LogDebugf("action[handleLcNodeHeartbeatResp], lcNode[%v] taskRsp: %v", nodeAddr, taskRsp)
}
if len(resp.LcScanningTasks) < resp.LcTaskCountLimit {
log.LogInfof("action[handleLcNodeHeartbeatResp], notify idle lcNode[%v], now LcScanningTasks[%v]", nodeAddr, len(resp.LcScanningTasks))
c.lcMgr.notifyIdleLcNode()
}
// handle SnapshotScanningTasks
for _, taskRsp := range resp.SnapshotScanningTasks {
c.snapshotMgr.lcSnapshotTaskStatus.Lock()
// avoid overwriting TaskResults when the snapshot scan response was received before this heartbeat
if c.snapshotMgr.lcSnapshotTaskStatus.TaskResults[taskRsp.ID] != nil && c.snapshotMgr.lcSnapshotTaskStatus.TaskResults[taskRsp.ID].Done {
log.LogInfof("action[handleLcNodeHeartbeatResp], lcNode[%v] snapshot task[%v] already done", nodeAddr, taskRsp.ID)
} else {
t := time.Now()
taskRsp.UpdateTime = &t
c.snapshotMgr.lcSnapshotTaskStatus.TaskResults[taskRsp.ID] = taskRsp
}
c.snapshotMgr.lcSnapshotTaskStatus.Unlock()
log.LogDebugf("action[handleLcNodeHeartbeatResp], lcNode[%v] snapshot taskRsp: %v", nodeAddr, taskRsp)
}
if len(resp.SnapshotScanningTasks) < resp.LcTaskCountLimit {
n := resp.LcTaskCountLimit - len(resp.SnapshotScanningTasks)
log.LogInfof("action[handleLcNodeHeartbeatResp], notify idle lcNode[%v], now SnapshotScanningTasks[%v], notify times[%v]", nodeAddr, len(resp.SnapshotScanningTasks), n)
for i := 0; i < n; i++ {
c.snapshotMgr.notifyIdleLcNode()
}
}
log.LogInfof("action[handleLcNodeHeartbeatResp], lcNode[%v], heartbeat success", nodeAddr)
return
}
func (c *Cluster) handleLcNodeLcScanResp(nodeAddr string, resp *proto.LcNodeRuleTaskResponse) (err error) {
log.LogDebugf("action[handleLcNodeLcScanResp] lcNode[%v] task[%v] Enter", nodeAddr, resp.ID)
defer func() {
log.LogDebugf("action[handleLcNodeLcScanResp] lcNode[%v] task[%v] Exit", nodeAddr, resp.ID)
}()
switch resp.Status {
case proto.TaskFailed:
log.LogWarnf("action[handleLcNodeLcScanResp] scanning failed, resp(%v), no redo", resp)
return
case proto.TaskSucceeds:
c.lcMgr.lcRuleTaskStatus.AddResult(resp)
log.LogInfof("action[handleLcNodeLcScanResp] scanning completed, resp(%v)", resp)
return
default:
log.LogInfof("action[handleLcNodeLcScanResp] scanning received, resp(%v)", resp)
}
return
}
func (c *Cluster) handleLcNodeSnapshotScanResp(nodeAddr string, resp *proto.SnapshotVerDelTaskResponse) (err error) {
log.LogDebugf("action[handleLcNodeSnapshotScanResp] lcNode[%v] task[%v] Enter", nodeAddr, resp.ID)
defer func() {
log.LogDebugf("action[handleLcNodeSnapshotScanResp] lcNode[%v] task[%v] Exit", nodeAddr, resp.ID)
}()
switch resp.Status {
case proto.TaskFailed:
c.snapshotMgr.lcSnapshotTaskStatus.RedoTask(resp.SnapshotVerDelTask)
log.LogErrorf("action[handleLcNodeSnapshotScanResp] scanning failed, resp(%v), redo", resp)
return
case proto.TaskSucceeds:
// 1.mark done for VersionMgr
var vol *Vol
vol, err = c.getVol(resp.VolName)
if err != nil {
log.LogErrorf("action[handleLcNodeSnapshotScanResp] snapshot task(%v) scanning completed by %v, results(%v), volume(%v) is not found",
resp.ID, nodeAddr, resp, resp.VolName)
} else {
_ = vol.VersionMgr.DelVer(resp.VerSeq)
}
// 2. mark done for snapshotMgr
c.snapshotMgr.lcSnapshotTaskStatus.AddResult(resp)
log.LogInfof("action[handleLcNodeSnapshotScanResp] scanning completed, resp(%v)", resp)
return
default:
log.LogInfof("action[handleLcNodeSnapshotScanResp] scanning received, resp(%v)", resp)
}
return
}
package master
import (
"encoding/json"
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type UidSpaceManager struct {
volName string
mpSpaceMetrics map[uint64][]*proto.UidReportSpaceInfo
uidInfo map[uint32]*proto.UidSpaceInfo
c *Cluster
vol *Vol
sync.RWMutex
}
type UidSpaceFsm struct {
UidSpaceArr []*proto.UidSpaceInfo
}
func (vol *Vol) initUidSpaceManager(c *Cluster) {
vol.uidSpaceManager = &UidSpaceManager{
c: c,
vol: vol,
volName: vol.Name,
mpSpaceMetrics: make(map[uint64][]*proto.UidReportSpaceInfo),
uidInfo: make(map[uint32]*proto.UidSpaceInfo),
}
}
func (uMgr *UidSpaceManager) addUid(uid uint32, size uint64) bool {
uMgr.Lock()
uMgr.uidInfo[uid] = &proto.UidSpaceInfo{
LimitSize: size,
VolName: uMgr.volName,
Uid: uid,
Enabled: true,
}
uMgr.persist()
uMgr.Unlock()
uMgr.listAll()
return true
}
func (uMgr *UidSpaceManager) removeUid(uid uint32) bool {
uMgr.Lock()
defer uMgr.Unlock()
if _, ok := uMgr.uidInfo[uid]; !ok {
log.LogErrorf("UidSpaceManager.vol %v del %v failed", uMgr.volName, uid)
return true
}
uMgr.uidInfo[uid].Enabled = false
uMgr.uidInfo[uid].Limited = false
uMgr.persist()
log.LogDebugf("UidSpaceManager.vol %v del %v success", uMgr.volName, uid)
return true
}
func (uMgr *UidSpaceManager) checkUid(uid uint32) (ok bool, uidInfo *proto.UidSpaceInfo) {
uMgr.RLock()
defer uMgr.RUnlock()
uidInfo, ok = uMgr.uidInfo[uid]
return
}
func (uMgr *UidSpaceManager) listAll() (rsp []*proto.UidSpaceInfo) {
uMgr.RLock()
defer uMgr.RUnlock()
log.LogDebugf("UidSpaceManager. listAll vol %v, info %v", uMgr.volName, len(uMgr.uidInfo))
for _, t := range uMgr.uidInfo {
log.LogDebugf("UidSpaceManager. listAll vol %v, uid %v, info %v", t.VolName, t.Uid, t)
rsp = append(rsp, t)
}
return
}
func (uMgr *UidSpaceManager) persist() (err error) {
log.LogDebugf("vol %v UidSpaceManager persist", uMgr.volName)
var uidFsm UidSpaceFsm
for _, t := range uMgr.uidInfo {
uidFsm.UidSpaceArr = append(uidFsm.UidSpaceArr, t)
}
var val []byte
if val, err = json.Marshal(uidFsm); err != nil {
log.LogErrorf("UidSpaceManager vol %v uid persist error %v", uMgr.vol.Name, err)
return
}
if err = uMgr.c.syncUidSpaceList(uMgr.vol, val); err != nil {
log.LogErrorf("UidSpaceManager vol %v uid persist syncUidList error %v", uMgr.vol.Name, err)
return
}
return
}
func (uMgr *UidSpaceManager) load(c *Cluster, val []byte) (err error) {
log.LogDebugf("vol %v UidSpaceManager load", uMgr.volName)
uMgr.c = c
uidFsm := &UidSpaceFsm{}
if err = json.Unmarshal(val, uidFsm); err != nil {
log.LogErrorf("UidSpaceManager vol %v Unmarshal error %v", uMgr.volName, err)
return
}
for _, info := range uidFsm.UidSpaceArr {
uMgr.uidInfo[info.Uid] = info
log.LogDebugf("vol %v uid %v load usedSize %v limit %v enabled %v", uMgr.volName, info.Uid, info.UsedSize, info.LimitSize, info.Limited)
}
return
}
func (uMgr *UidSpaceManager) getSpaceOp() (rsp []*proto.UidSpaceInfo) {
uMgr.RLock()
defer uMgr.RUnlock()
for _, info := range uMgr.uidInfo {
rsp = append(rsp, info)
log.LogDebugf("getSpaceOp. vol %v uid %v enabled %v", info.VolName, info.Uid, info.Limited)
}
return
}
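// volUidUpdate recalculates per-uid space usage from the latest reports of
// the meta partition leaders and flips the Limited flag for any uid whose
// aggregated UsedSize exceeds its configured LimitSize.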
func (uMgr *UidSpaceManager) volUidUpdate(report *proto.MetaPartitionReport) {
if !report.IsLeader {
return
}
uMgr.Lock()
defer uMgr.Unlock()
id := report.PartitionID
uMgr.mpSpaceMetrics[id] = report.UidInfo
log.LogDebugf("vol %v volUidUpdate.mpID %v set uid %v. uid list size %v", uMgr.volName, id, report.UidInfo, len(uMgr.uidInfo))
for _, info := range uMgr.uidInfo {
info.UsedSize = 0
}
uidInfo := make(map[uint32]*proto.UidSpaceInfo)
for mpId, info := range uMgr.mpSpaceMetrics {
log.LogDebugf("vol %v volUidUpdate. reCalc mpId %v info %v", uMgr.volName, mpId, len(info))
for _, space := range info {
if _, ok := uMgr.uidInfo[space.Uid]; !ok {
log.LogDebugf("vol %v volUidUpdate.uid %v not found", uMgr.volName, space.Uid)
uMgr.uidInfo[space.Uid] = &proto.UidSpaceInfo{
VolName: uMgr.volName,
Uid: space.Uid,
CTime: time.Now().Unix(),
}
}
if _, ok := uidInfo[space.Uid]; !ok {
uidInfo[space.Uid] = &(*uMgr.uidInfo[space.Uid])
}
log.LogDebugf("volUidUpdate.vol %v uid %v from mpId %v useSize %v add %v", uMgr.vol, space.Uid, mpId, uidInfo[space.Uid].UsedSize, space.Size)
uidInfo[space.Uid].UsedSize += space.Size
if !uidInfo[space.Uid].Enabled {
uidInfo[space.Uid].Limited = false
continue
}
if uidInfo[space.Uid].UsedSize > uMgr.uidInfo[space.Uid].LimitSize {
uidInfo[space.Uid].Limited = true
log.LogWarnf("volUidUpdate.vol %v uid %v from mpId %v useSize %v add %v", uMgr.vol, space.Uid, mpId, uidInfo[space.Uid].UsedSize, space.Size)
} else {
uidInfo[space.Uid].Limited = false
log.LogWarnf("volUidUpdate.vol %v uid %v from mpId %v useSize %v add %v", uMgr.vol, space.Uid, mpId, uidInfo[space.Uid].UsedSize, space.Size)
}
}
}
log.LogDebugf("vol %v volUidUpdate.mpID %v set uid %v. uid list size %v", uMgr.volName, id, report.UidInfo, len(uMgr.uidInfo))
for _, info := range uidInfo {
if _, ok := uMgr.uidInfo[info.Uid]; !ok {
log.LogErrorf("volUidUpdate.uid %v not found", info.Uid)
continue
}
uMgr.uidInfo[info.Uid] = info
}
for _, info := range uMgr.uidInfo {
if info.UsedSize == 0 {
info.Limited = false
}
}
log.LogDebugf("volUidUpdate.mpID %v set uid %v. uid list size %v", id, report.UidInfo, len(uMgr.uidInfo))
}
type ServerFactorLimit struct {
Name string
Type uint32
Total uint64
Buffer uint64 // preallocated flow buffer, set equal to the flow total when the limit is updated
CliUsed uint64
CliNeed uint64
Allocated uint64
NeedAfterAlloc uint64
magnify uint32 // magnification factor applied when clients request more allocation
LimitRate float32
LastMagnify uint64
requestCh chan interface{}
done chan interface{}
qosManager *QosCtrlManager
}
type ClientReportOutput struct {
ID uint64
FactorMap map[uint32]*proto.ClientLimitInfo
Host string
Status uint8
}
type LimitOutput struct {
ID uint64
Enable bool
ReqPeriod uint32
HitTriggerCnt uint8
FactorMap map[uint32]*proto.ClientLimitInfo
}
type ClientInfoOutput struct {
Cli *ClientReportOutput
Assign *LimitOutput
Time time.Time
ID uint64
Host string
}
type ClientInfoMgr struct {
Cli *proto.ClientReportLimitInfo
Assign *proto.LimitRsp2Client
Time time.Time
ID uint64
Host string
}
type qosRequestArgs struct {
clientID uint64
factorType uint32
clientReq *proto.ClientLimitInfo
lastClientInfo *proto.ClientLimitInfo
assignInfo *proto.ClientLimitInfo
rsp2Client *proto.ClientLimitInfo
wg *sync.WaitGroup
}
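// QosCtrlManager coordinates volume QoS between clients and the master:
// clients periodically report used/needed quota per factor type, the manager
// aggregates the reports per factor (serverFactorLimitMap), derives a limit
// rate when demand exceeds the configured total, and hands each client a new
// assignment on its next report.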
type QosCtrlManager struct {
cliInfoMgrMap map[uint64]*ClientInfoMgr // clientID -> client report info and assigned limit info
serverFactorLimitMap map[uint32]*ServerFactorLimit // vol qos data for iops w/r and flow w/r
defaultClientCnt uint32
qosEnable bool
ClientReqPeriod uint32
ClientHitTriggerCnt uint32
vol *Vol
sync.RWMutex
}
func (qosManager *QosCtrlManager) volUpdateMagnify(magnifyArgs *qosArgs) {
defer qosManager.Unlock()
qosManager.Lock()
log.LogWarnf("action[volUpdateMagnify] vol %v try set magnify iopsRVal[%v],iopsWVal[%v],flowRVal[%v],flowWVal[%v]",
qosManager.vol.Name, magnifyArgs.iopsRVal, magnifyArgs.iopsWVal, magnifyArgs.flowRVal, magnifyArgs.flowWVal)
arrMagnify := [4]uint64{magnifyArgs.iopsRVal, magnifyArgs.iopsWVal, magnifyArgs.flowRVal, magnifyArgs.flowWVal}
for i := proto.IopsReadType; i <= proto.FlowWriteType; i++ {
magnify := qosManager.serverFactorLimitMap[i].magnify
if uint64(magnify) != arrMagnify[i-1] && arrMagnify[i-1] > 0 {
qosManager.serverFactorLimitMap[i].magnify = uint32(arrMagnify[i-1])
log.LogWarnf("action[volUpdateMagnify] vol %v after update type [%v] magnify [%v] to [%v]",
qosManager.vol.Name, proto.QosTypeString(i), magnify, arrMagnify[i-1])
}
}
}
func (qosManager *QosCtrlManager) volUpdateLimit(limitArgs *qosArgs) {
defer qosManager.Unlock()
qosManager.Lock()
log.LogWarnf("action[volUpdateLimit] vol %v try set limit iopsrlimit[%v],iopswlimit[%v],flowrlimit[%v],flowwlimit[%v]",
qosManager.vol.Name, limitArgs.iopsRVal, limitArgs.iopsWVal, limitArgs.flowRVal, limitArgs.flowWVal)
//if limitArgs.iopsWVal != 0 {
// qosManager.serverFactorLimitMap[proto.IopsWriteType].Total = limitArgs.iopsWVal
// qosManager.serverFactorLimitMap[proto.IopsWriteType].LastMagnify = 0
//}
//if limitArgs.iopsRVal != 0 {
// qosManager.serverFactorLimitMap[proto.IopsReadType].Total = limitArgs.iopsRVal
// qosManager.serverFactorLimitMap[proto.IopsWriteType].LastMagnify = 0
//}
if limitArgs.flowWVal != 0 {
qosManager.serverFactorLimitMap[proto.FlowWriteType].Total = limitArgs.flowWVal
qosManager.serverFactorLimitMap[proto.FlowWriteType].LastMagnify = 0
qosManager.serverFactorLimitMap[proto.FlowWriteType].Buffer = limitArgs.flowWVal
}
if limitArgs.flowRVal != 0 {
qosManager.serverFactorLimitMap[proto.FlowReadType].Total = limitArgs.flowRVal
qosManager.serverFactorLimitMap[proto.FlowReadType].LastMagnify = 0
qosManager.serverFactorLimitMap[proto.FlowReadType].Buffer = limitArgs.flowRVal
}
for i := proto.IopsReadType; i <= proto.FlowWriteType; i++ {
limitf := qosManager.serverFactorLimitMap[i]
log.LogWarnf("action[volUpdateLimit] vol [%v] after set type [%v] [%v,%v,%v,%v]",
qosManager.vol.Name, proto.QosTypeString(i), limitf.Allocated, limitf.NeedAfterAlloc, limitf.Total, limitf.Buffer)
}
}
func (qosManager *QosCtrlManager) getQosMagnify(factorType uint32) uint32 {
return qosManager.serverFactorLimitMap[factorType].magnify
}
func (qosManager *QosCtrlManager) getQosLimit(factorType uint32) uint64 {
return qosManager.serverFactorLimitMap[factorType].Total
}
func (qosManager *QosCtrlManager) initClientQosInfo(clientID uint64, host string) (limitRsp2Client *proto.LimitRsp2Client, err error) {
log.QosWriteDebugf("action[initClientQosInfo] vol %v clientID %v Host %v", qosManager.vol.Name, clientID, host)
clientInitInfo := proto.NewClientReportLimitInfo()
cliCnt := qosManager.defaultClientCnt
if cliCnt <= proto.QosDefaultClientCnt {
cliCnt = proto.QosDefaultClientCnt
}
if len(qosManager.cliInfoMgrMap) > int(cliCnt) {
cliCnt = uint32(len(qosManager.cliInfoMgrMap))
}
limitRsp2Client = proto.NewLimitRsp2Client()
limitRsp2Client.ID = clientID
limitRsp2Client.Enable = qosManager.qosEnable
factorType := proto.IopsReadType
defer qosManager.Unlock()
qosManager.Lock()
for factorType <= proto.FlowWriteType {
var initLimit uint64
serverLimit := qosManager.serverFactorLimitMap[factorType]
if qosManager.qosEnable {
initLimit = serverLimit.Total / uint64(cliCnt)
if serverLimit.Buffer > initLimit {
serverLimit.Buffer -= initLimit
serverLimit.Allocated += initLimit
} else {
initLimit = serverLimit.Buffer
serverLimit.Allocated += initLimit
serverLimit.Buffer = 0
}
if factorType == proto.FlowWriteType || factorType == proto.FlowReadType {
if initLimit > 1*util.GB/8 {
initLimit = 1 * util.GB / 8
}
} else {
if initLimit > 200 {
initLimit = 200
}
}
}
clientInitInfo.FactorMap[factorType] = &proto.ClientLimitInfo{
UsedLimit: initLimit,
UsedBuffer: 0,
Used: 0,
Need: 0,
}
limitRsp2Client.Magnify[factorType] = serverLimit.magnify
limitRsp2Client.FactorMap[factorType] = clientInitInfo.FactorMap[factorType]
log.QosWriteDebugf("action[initClientQosInfo] vol [%v] clientID [%v] factorType [%v] init client info and set limitRsp2Client [%v]"+
"server total[%v] used [%v] buffer [%v]",
qosManager.vol.Name, clientID, proto.QosTypeString(factorType),
initLimit, serverLimit.Total, serverLimit.Allocated, serverLimit.Buffer)
factorType++
}
qosManager.cliInfoMgrMap[clientID] = &ClientInfoMgr{
Cli: clientInitInfo,
Assign: limitRsp2Client,
Time: time.Now(),
ID: clientID,
Host: host,
}
log.QosWriteDebugf("action[initClientQosInfo] vol [%v] clientID [%v] Assign [%v]", qosManager.vol.Name, clientID, limitRsp2Client)
return
}
func (serverLimit *ServerFactorLimit) String() string {
return fmt.Sprintf("serverLimit {total:[%v],alloc:(allocated:[%v],need:[%v],buffer:[%v]),limit:(limitrate:[%v], magnify:[%v]),client sum {used:[%v], need:[%v]}}",
serverLimit.Total, serverLimit.Allocated, serverLimit.NeedAfterAlloc, serverLimit.Buffer,
serverLimit.LimitRate, serverLimit.LastMagnify,
serverLimit.CliUsed, serverLimit.CliNeed)
}
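// getDstLimit scales the client's demand (used + need) by a tier-dependent
// factor: small demands are doubled, larger ones grow by 1.5x, 1.2x or 1.1x,
// and very large ones only by a fixed increment. For example (numbers are
// illustrative), an iops factor with used=300 and need=100 falls into the
// "< 500" tier and yields a destination limit of 600.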
func (serverLimit *ServerFactorLimit) getDstLimit(factorType uint32, used, need uint64) (dstLimit uint64) {
if factorType == proto.FlowWriteType || factorType == proto.FlowReadType {
if need > used {
need = used
}
if (need + used) < 10*util.MB/8 {
dstLimit = uint64(float64(need+used) * 2)
} else if (need + used) < 50*util.MB/8 {
dstLimit = uint64(float64(need+used) * 1.5)
} else if (need + used) < 100*util.MB/8 {
dstLimit = uint64(float64(need+used) * 1.2)
} else if (need + used) < 1*util.GB/8 {
dstLimit = uint64(float64(need+used) * 1.1)
} else {
dstLimit = uint64(float64(need+used) + 1*util.GB/8)
}
} else {
if (need + used) < 100 {
dstLimit = uint64(float64(need+used) * 2)
} else if (need + used) < 500 {
dstLimit = uint64(float64(need+used) * 1.5)
} else if (need + used) < 1000 {
dstLimit = uint64(float64(need+used) * 1.2)
} else if (need + used) < 5000 {
dstLimit = uint64(float64(need+used) * 1.2)
} else {
dstLimit = uint64(float64(need+used) + 1000)
}
}
return
}
func (serverLimit *ServerFactorLimit) dispatch() {
for {
select {
case request := <-serverLimit.requestCh:
serverLimit.updateLimitFactor(request)
case <-serverLimit.done:
log.LogErrorf("done ServerFactorLimit type (%v)", serverLimit.Type)
return
}
}
}
// handle a client request and respond with extra quota when the buffer is sufficient, according to the allocation rules
func (serverLimit *ServerFactorLimit) updateLimitFactor(req interface{}) {
request := req.(*qosRequestArgs)
clientID := request.clientID
factorType := request.factorType
clientReq := request.clientReq
assignInfo := request.assignInfo
rsp2Client := request.rsp2Client
lastClientInfo := request.lastClientInfo
log.QosWriteDebugf("action[updateLimitFactor] vol [%v] clientID [%v] type [%v],client report [%v,%v,%v,%v] last client report [%v,%v,%v,%v] periodically cal Assign [%v,%v]",
serverLimit.qosManager.vol.Name, clientID, proto.QosTypeString(factorType),
clientReq.Used, clientReq.Need, clientReq.UsedLimit, clientReq.UsedBuffer,
lastClientInfo.Used, lastClientInfo.Need, lastClientInfo.UsedLimit, lastClientInfo.UsedBuffer,
assignInfo.UsedLimit, assignInfo.UsedBuffer)
rsp2Client.UsedLimit = assignInfo.UsedLimit
rsp2Client.UsedBuffer = assignInfo.UsedBuffer
// flow limit and buffer are not enough; the client needs more
if (clientReq.Need + clientReq.Used) > (assignInfo.UsedLimit + assignInfo.UsedBuffer) {
log.QosWriteDebugf("action[updateLimitFactor] vol [%v] clientID [%v] type [%v], need [%v] used [%v], used limit [%v]",
serverLimit.qosManager.vol.Name, clientID, proto.QosTypeString(factorType), clientReq.Need, clientReq.Used, clientReq.UsedLimit)
dstLimit := serverLimit.getDstLimit(factorType, clientReq.Used, clientReq.Need)
// Assign already allocated the buffer for client
if dstLimit > assignInfo.UsedLimit+assignInfo.UsedBuffer {
additionBuffer := dstLimit - assignInfo.UsedLimit - assignInfo.UsedBuffer
// if buffer is available, the overall balance must not be affected; use as much of the buffer as possible
if serverLimit.Buffer > 0 {
log.QosWriteDebugf("action[updateLimitFactor] vol [%v] clientID [%v] type [%v] client need more buffer [%v] serverlimit buffer [%v] used [%v]",
serverLimit.qosManager.vol.Name, clientID, proto.QosTypeString(factorType),
additionBuffer, serverLimit.Buffer, serverLimit.Allocated)
// calc dst buffer for client to expand
// Allocated is normally greater than zero here because dstLimit is non-zero and counts toward it; the zero case is still handled below
var dstUsedBuffer uint64
if serverLimit.Allocated != 0 {
dstUsedBuffer = uint64(float64(dstLimit) * (float64(serverLimit.Buffer) / float64(serverLimit.Allocated)) * 0.5)
if dstUsedBuffer > dstLimit {
dstUsedBuffer = dstLimit
}
} else {
dstUsedBuffer = dstLimit
}
if assignInfo.UsedBuffer < dstUsedBuffer {
additionBuffer = dstUsedBuffer - assignInfo.UsedBuffer
if additionBuffer > serverLimit.Buffer {
rsp2Client.UsedBuffer += serverLimit.Buffer
assignInfo.UsedBuffer = rsp2Client.UsedBuffer
serverLimit.Allocated += serverLimit.Buffer
serverLimit.Buffer = 0
} else {
rsp2Client.UsedBuffer = dstUsedBuffer
assignInfo.UsedBuffer = dstUsedBuffer
serverLimit.Buffer -= additionBuffer
serverLimit.Allocated += additionBuffer
}
}
}
}
}
log.QosWriteDebugf("action[updateLimitFactor] vol [%v] [clientID [%v] type [%v] rsp2Client.UsedLimit [%v], UsedBuffer [%v]",
serverLimit.qosManager.vol.Name, clientID, proto.QosTypeString(factorType), rsp2Client.UsedLimit, rsp2Client.UsedBuffer)
request.wg.Done()
}
func (qosManager *QosCtrlManager) init(cluster *Cluster, host string) (limit *proto.LimitRsp2Client, err error) {
log.QosWriteDebugf("action[qosManage.init] vol [%v] Host %v", qosManager.vol.Name, host)
var id uint64
if id, err = cluster.idAlloc.allocateClientID(); err == nil {
return qosManager.initClientQosInfo(id, host)
}
return
}
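// HandleClientQosReq processes a periodic client QoS report. If QoS is
// disabled it simply echoes the client's own usage back as its limit.
// Otherwise it fans out one request per factor type to the corresponding
// ServerFactorLimit dispatch goroutine and waits for all of them before
// returning the new assignment to the client.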
func (qosManager *QosCtrlManager) HandleClientQosReq(reqClientInfo *proto.ClientReportLimitInfo, clientID uint64) (limitRsp *proto.LimitRsp2Client, err error) {
log.QosWriteDebugf("action[HandleClientQosReq] vol [%v] reqClientInfo from [%v], enable [%v]",
qosManager.vol.Name, clientID, qosManager.qosEnable)
qosManager.RLock()
clientInfo, lastExist := qosManager.cliInfoMgrMap[clientID]
if !lastExist || reqClientInfo == nil {
qosManager.RUnlock()
host := ""
if reqClientInfo != nil {
host = reqClientInfo.Host
}
log.LogWarnf("action[HandleClientQosReq] vol [%v] id [%v] addr [%v] not exist", qosManager.vol.Name, clientID, host)
return qosManager.initClientQosInfo(clientID, host)
}
qosManager.RUnlock()
limitRsp = proto.NewLimitRsp2Client()
limitRsp.Enable = qosManager.qosEnable
limitRsp.ID = reqClientInfo.ID
limitRsp.ReqPeriod = qosManager.ClientReqPeriod
limitRsp.HitTriggerCnt = uint8(qosManager.ClientHitTriggerCnt)
if !qosManager.qosEnable {
clientInfo.Cli = reqClientInfo
limitRsp.FactorMap = reqClientInfo.FactorMap
clientInfo.Assign = limitRsp
clientInfo.Time = time.Now()
for i := proto.IopsReadType; i <= proto.FlowWriteType; i++ {
reqClientInfo.FactorMap[i].UsedLimit = reqClientInfo.FactorMap[i].Used
reqClientInfo.FactorMap[i].UsedBuffer = reqClientInfo.FactorMap[i].Need
log.QosWriteDebugf("action[HandleClientQosReq] vol [%v] [%v,%v,%v,%v]", qosManager.vol.Name,
reqClientInfo.FactorMap[i].Used,
reqClientInfo.FactorMap[i].Need,
reqClientInfo.FactorMap[i].UsedLimit,
reqClientInfo.FactorMap[i].UsedBuffer)
}
return
}
index := 0
wg := &sync.WaitGroup{}
wg.Add(len(reqClientInfo.FactorMap))
for factorType, clientFactor := range reqClientInfo.FactorMap {
limitRsp.FactorMap[factorType] = &proto.ClientLimitInfo{}
serverLimit := qosManager.serverFactorLimitMap[factorType]
limitRsp.Magnify[factorType] = serverLimit.magnify
request := &qosRequestArgs{
clientID: clientID,
factorType: factorType,
clientReq: clientFactor,
lastClientInfo: clientInfo.Cli.FactorMap[factorType],
assignInfo: clientInfo.Assign.FactorMap[factorType],
rsp2Client: limitRsp.FactorMap[factorType],
wg: wg,
}
serverLimit.requestCh <- request
index++
}
wg.Wait()
clientInfo.Cli = reqClientInfo
clientInfo.Assign = limitRsp
clientInfo.Time = time.Now()
return
}
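// updateServerLimitByClientsInfo aggregates the latest client reports for
// one factor type, recomputes the free server buffer, and derives LimitRate,
// the fraction of aggregate demand that cannot be satisfied by the configured
// total (adjusted by LastMagnify to close the gap between assigned and
// actually used quota).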
func (qosManager *QosCtrlManager) updateServerLimitByClientsInfo(factorType uint32) {
var (
cliSum proto.ClientLimitInfo
nextStageNeed, nextStageUse uint64
)
qosManager.RLock()
serverLimit := qosManager.serverFactorLimitMap[factorType]
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] type [%v] last limitInfo(%v)",
qosManager.vol.Name, proto.QosTypeString(factorType), serverLimit)
// sum usage and demand across all client reports
for id, cliInfo := range qosManager.cliInfoMgrMap {
cliFactor := cliInfo.Cli.FactorMap[factorType]
cliSum.Used += cliFactor.Used
cliSum.Need += cliFactor.Need
cliSum.UsedLimit += cliFactor.UsedLimit
cliSum.UsedBuffer += cliFactor.UsedBuffer
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] Host [%v] type [%v] used [%v] need [%v] limit [%v] buffer [%v]",
qosManager.vol.Name, host, proto.QosTypeString(factorType),
cliFactor.Used, cliFactor.Need, cliFactor.UsedLimit, cliFactor.UsedBuffer)
}
serverLimit.CliUsed = cliSum.Used
serverLimit.CliNeed = cliSum.Need
qosManager.RUnlock()
if !qosManager.qosEnable {
return
}
serverLimit.Buffer = 0
nextStageUse = cliSum.Used
nextStageNeed = cliSum.Need
if serverLimit.Total >= nextStageUse {
serverLimit.Buffer = serverLimit.Total - nextStageUse
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] reset server buffer [%v] all clients nextStageUse [%v]",
qosManager.vol.Name, serverLimit.Buffer, nextStageUse)
if nextStageNeed > serverLimit.Buffer {
nextStageNeed -= serverLimit.Buffer
nextStageUse += serverLimit.Buffer
serverLimit.Buffer = 0
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] reset server buffer [%v] all clients nextStageNeed [%v] too much",
qosManager.vol.Name, serverLimit.Buffer, nextStageNeed)
} else {
serverLimit.Buffer -= nextStageNeed
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] reset server buffer [%v] all clients nextStageNeed [%v]",
qosManager.vol.Name, serverLimit.Buffer, nextStageNeed)
nextStageUse += nextStageNeed
nextStageNeed = 0
}
} else { // usage is larger than the limit
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol[%v] type [%v] clients needs [%v] plus overuse [%v],get nextStageNeed [%v]",
qosManager.vol.Name, proto.QosTypeString(factorType), nextStageNeed, nextStageUse-serverLimit.Total,
nextStageNeed+nextStageUse-serverLimit.Total)
nextStageNeed += nextStageUse - serverLimit.Total
nextStageUse = serverLimit.Total
}
serverLimit.Allocated = nextStageUse
serverLimit.NeedAfterAlloc = nextStageNeed
// compute the limit rate; the remaining need should be zero if total usage can still increase
serverLimit.LimitRate = 0
if serverLimit.NeedAfterAlloc > 0 {
serverLimit.LimitRate = float32(float64(serverLimit.NeedAfterAlloc) / float64(serverLimit.Allocated+serverLimit.NeedAfterAlloc))
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] type [%v] alloc not enough need limitRatio serverLimit:(%v)",
qosManager.vol.Name, proto.QosTypeString(factorType), serverLimit)
lastMagnify := serverLimit.LastMagnify
lastLimitRatio := serverLimit.LimitRate
// the limit and buffer assigned by the master were not used as expected; adjust for the gap
if serverLimit.CliUsed < serverLimit.Total {
if serverLimit.LimitRate > -10.0 && serverLimit.LastMagnify < serverLimit.Total*10 {
serverLimit.LastMagnify += uint64(float64(serverLimit.Total-serverLimit.CliUsed) * 0.1)
}
} else {
if serverLimit.LastMagnify > 0 {
var magnify uint64
if serverLimit.LastMagnify > (serverLimit.CliUsed - serverLimit.Total) {
magnify = serverLimit.CliUsed - serverLimit.Total
} else {
magnify = serverLimit.LastMagnify
}
serverLimit.LastMagnify -= uint64(float32(magnify) * 0.1)
}
}
serverLimit.LimitRate = serverLimit.LimitRate * float32(1-float64(serverLimit.LastMagnify)/float64(serverLimit.Allocated+serverLimit.NeedAfterAlloc))
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] type [%v] limitRatio [%v] updated to limitRatio [%v] by magnify [%v] lastMagnify [%v]",
qosManager.vol.Name, proto.QosTypeString(factorType),
lastLimitRatio, serverLimit.LimitRate, serverLimit.LastMagnify, lastMagnify)
} else {
serverLimit.LastMagnify = 0
}
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] type [%v] after adjust limitRatio serverLimit:(%v)",
qosManager.vol.Name, proto.QosTypeString(factorType), serverLimit)
return
}
func (qosManager *QosCtrlManager) assignClientsNewQos(factorType uint32) {
qosManager.RLock()
if !qosManager.qosEnable {
qosManager.RUnlock()
return
}
serverLimit := qosManager.serverFactorLimitMap[factorType]
var bufferAllocated uint64
// recalculate client Assign limit and buffer
for _, cliInfoMgr := range qosManager.cliInfoMgrMap {
cliInfo := cliInfoMgr.Cli.FactorMap[factorType]
assignInfo := cliInfoMgr.Assign.FactorMap[factorType]
if cliInfo.Used+cliInfo.Need == 0 {
assignInfo.UsedLimit = 0
assignInfo.UsedBuffer = 0
} else {
assignInfo.UsedLimit = uint64(float64(cliInfo.Used+cliInfo.Need) * float64(1-serverLimit.LimitRate))
if serverLimit.Allocated != 0 {
assignInfo.UsedBuffer = uint64(float64(serverLimit.Buffer) * (float64(assignInfo.UsedLimit) / float64(serverLimit.Allocated)) * 0.5)
}
// the remaining buffer may be quite large; do not hand it all out, and cap a client's buffer at its used limit
if assignInfo.UsedBuffer > assignInfo.UsedLimit {
assignInfo.UsedBuffer = assignInfo.UsedLimit
}
}
bufferAllocated += assignInfo.UsedBuffer
}
qosManager.RUnlock()
if serverLimit.Buffer > bufferAllocated {
serverLimit.Buffer -= bufferAllocated
} else {
serverLimit.Buffer = 0
log.LogWarnf("action[assignClientsNewQos] vol [%v] type [%v] clients buffer [%v] and server buffer used up trigger flow limit overall",
qosManager.vol.Name, proto.QosTypeString(factorType), bufferAllocated)
}
log.QosWriteDebugf("action[assignClientsNewQos] vol [%v] type [%v] serverLimit buffer:[%v] used:[%v] need:[%v] total:[%v]",
qosManager.vol.Name, proto.QosTypeString(factorType),
serverLimit.Buffer, serverLimit.Allocated, serverLimit.NeedAfterAlloc, serverLimit.Total)
}
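// checkQos runs periodically for a volume: it expires clients that have not
// reported for 20 seconds, recomputes the per-factor server limits from the
// latest reports, and, when QoS is enabled, pushes new assignments to the
// remaining clients.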
func (vol *Vol) checkQos() {
vol.qosManager.Lock()
// delete clients that have not reported for a long time from the map
tTime := time.Now()
for id, cli := range vol.qosManager.cliInfoMgrMap {
if cli.Time.Add(20 * time.Second).Before(tTime) {
log.LogWarnf("action[checkQos] vol [%v] Id [%v] addr [%v] be delete in case of long time no request",
vol.Name, id, cli.Host)
delete(vol.qosManager.cliInfoMgrMap, id)
}
}
vol.qosManager.Unlock()
// periodically recompute the server limits and the assigned limit info for all clients,
// based on the latest client reports and the qos control settings
for factorType := proto.IopsReadType; factorType <= proto.FlowWriteType; factorType++ {
// aggregate all clients to get the real used and need values; the used value should be less than the total
vol.qosManager.updateServerLimitByClientsInfo(factorType)
// update client assign info by result above
if !vol.qosManager.qosEnable {
continue
}
vol.qosManager.assignClientsNewQos(factorType)
serverLimit := vol.qosManager.serverFactorLimitMap[factorType]
log.QosWriteDebugf("action[UpdateAllQosInfo] vol name [%v] type [%v] after updateServerLimitByClientsInfo get limitRate:[%v] "+
"server total [%v] beAllocated [%v] NeedAfterAlloc [%v] buffer [%v]",
vol.Name, proto.QosTypeString(factorType), serverLimit.LimitRate,
serverLimit.Total, serverLimit.Allocated, serverLimit.NeedAfterAlloc, serverLimit.Buffer)
}
}
func (vol *Vol) getQosStatus(cluster *Cluster) interface{} {
type qosStatus struct {
ServerFactorLimitMap map[uint32]*ServerFactorLimit // vol qos data for iops w/r and flow w/r
QosEnable bool
ClientReqPeriod uint32
ClientHitTriggerCnt uint32
ClusterMaxUploadCnt uint32
ClientALiveCnt int
}
vol.qosManager.RLock()
defer vol.qosManager.RUnlock()
return &qosStatus{
ServerFactorLimitMap: map[uint32]*ServerFactorLimit{
proto.FlowReadType: vol.qosManager.serverFactorLimitMap[proto.FlowReadType],
proto.FlowWriteType: vol.qosManager.serverFactorLimitMap[proto.FlowWriteType],
},
QosEnable: vol.qosManager.qosEnable,
ClientReqPeriod: vol.qosManager.ClientReqPeriod,
ClientHitTriggerCnt: vol.qosManager.ClientHitTriggerCnt,
ClusterMaxUploadCnt: uint32(cluster.QosAcceptLimit.Limit()),
ClientALiveCnt: len(vol.qosManager.cliInfoMgrMap),
}
}
func (vol *Vol) getClientLimitInfo(id uint64, ip string) (interface{}, error) {
log.QosWriteDebugf("action[getClientLimitInfo] vol [%v] id [%v] ip [%v]", vol.Name, id, ip)
vol.qosManager.RLock()
defer vol.qosManager.RUnlock()
assignFuc := func(info *ClientInfoMgr) (rspInfo *ClientInfoOutput) {
rspInfo = &ClientInfoOutput{
Cli: &ClientReportOutput{
ID: info.Cli.ID,
Status: info.Cli.Status,
FactorMap: make(map[uint32]*proto.ClientLimitInfo, 0),
},
Assign: &LimitOutput{
ID: info.Assign.ID,
Enable: info.Assign.Enable,
ReqPeriod: info.Assign.ReqPeriod,
HitTriggerCnt: info.Assign.HitTriggerCnt,
FactorMap: make(map[uint32]*proto.ClientLimitInfo, 0),
},
Time: info.Time,
Host: info.Host,
ID: info.ID,
}
rspInfo.Cli.FactorMap[proto.FlowReadType] = info.Cli.FactorMap[proto.FlowReadType]
rspInfo.Cli.FactorMap[proto.FlowWriteType] = info.Cli.FactorMap[proto.FlowWriteType]
rspInfo.Assign.FactorMap[proto.FlowReadType] = info.Assign.FactorMap[proto.FlowReadType]
rspInfo.Assign.FactorMap[proto.FlowWriteType] = info.Assign.FactorMap[proto.FlowWriteType]
return
}
if id > 0 {
if info, ok := vol.qosManager.cliInfoMgrMap[id]; ok {
if len(ip) > 0 && info.Host != ip {
return nil, fmt.Errorf("ip info [%v] not equal with request [%v]", info.Host, ip)
}
return assignFuc(info), nil
}
} else {
var resp []*ClientInfoOutput
for _, info := range vol.qosManager.cliInfoMgrMap {
// the client's http source port changes from time to time, so ignore the port here
rspInfo := assignFuc(info)
if len(ip) != 0 {
if info.Host == ip {
resp = append(resp, rspInfo)
}
} else {
resp = append(resp, rspInfo)
}
}
if len(resp) > 0 {
return resp, nil
}
}
return nil, fmt.Errorf("not found")
}
func (vol *Vol) volQosEnable(c *Cluster, enable bool) error {
log.LogWarnf("action[qosEnable] vol %v, set qos enable [%v], qosmgr[%v]", vol.Name, enable, vol.qosManager)
vol.qosManager.qosEnable = enable
vol.qosManager.Lock()
defer vol.qosManager.Unlock()
if !enable {
for _, limit := range vol.qosManager.cliInfoMgrMap {
for factorType := proto.IopsReadType; factorType <= proto.FlowWriteType; factorType++ {
limit.Assign.FactorMap[factorType] = &proto.ClientLimitInfo{}
}
}
}
return c.syncUpdateVol(vol)
}
func (vol *Vol) updateClientParam(c *Cluster, period, triggerCnt uint32) error {
vol.qosManager.ClientHitTriggerCnt = triggerCnt
vol.qosManager.ClientReqPeriod = period
return c.syncUpdateVol(vol)
}
func (vol *Vol) volQosUpdateLimit(c *Cluster, limitArgs *qosArgs) error {
vol.qosManager.volUpdateLimit(limitArgs)
return c.syncUpdateVol(vol)
}
type AclManager struct {
aclIps map[string]*proto.AclIpInfo
c *Cluster
vol *Vol
sync.RWMutex
}
type AclFsm struct {
AclIpArr []*proto.AclIpInfo
}
func (acl *AclManager) init(c *Cluster, vol *Vol) {
acl.c = c
acl.vol = vol
acl.aclIps = make(map[string]*proto.AclIpInfo)
}
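// aclOperate dispatches an ACL operation (add, delete, check or list IPs) under
// the manager's lock and returns either the operation's result or an error.
// A rough usage sketch (the caller-side wiring is an assumption, not shown in
// this file; callers receive an interface{} and must type-assert):
//
//	// assuming acl is the volume's *AclManager
//	ret := acl.aclOperate(util.AclAddIP, "192.168.0.1")
//	if err, ok := ret.(error); ok && err != nil {
//		// handle the error
//	}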
func (acl *AclManager) aclOperate(op uint64, ip string) interface{} {
acl.Lock()
defer acl.Unlock()
switch op {
case util.AclAddIP:
return acl.addIp(ip)
case util.AclDelIP:
return acl.removeIp(ip)
case util.AclCheckIP:
return acl.checkIp(ip)
case util.AclListIP:
return acl.listAll()
default:
err := fmt.Errorf("aclOperate op %v not found", op)
return err
}
}
func (acl *AclManager) listAll() (val []*proto.AclIpInfo) {
log.LogDebugf("vol %v listAll", acl.vol.Name)
for ip, info := range acl.aclIps {
log.LogDebugf("vol %v listAll ip %v", ip, acl.vol.Name)
val = append(val, info)
}
return
}
func (acl *AclManager) checkIp(ip string) (val []*proto.AclIpInfo) {
log.LogDebugf("vol %v checkIp %v", ip, acl.vol.Name)
if info, ok := acl.aclIps[ip]; ok {
log.LogDebugf("vol %v checkIp ip %v", ip, acl.vol.Name)
val = append(val, info)
}
return
}
func (acl *AclManager) addIp(ip string) (err error) {
log.LogDebugf("vol %v acl addIp %v", acl.vol.Name, ip)
if _, ok := acl.aclIps[ip]; ok {
return
}
acl.aclIps[ip] = &proto.AclIpInfo{
Ip: ip,
CTime: time.Now().Unix(),
}
return acl.persist()
}
func (acl *AclManager) removeIp(ip string) (err error) {
log.LogDebugf("vol %v acl removeIp %v", acl.vol.Name, ip)
delete(acl.aclIps, ip)
return acl.persist()
}
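// persist serializes the current ACL IP set to JSON and syncs it through the cluster.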
func (acl *AclManager) persist() (err error) {
log.LogDebugf("vol %v acl persist", acl.vol.Name)
var aclFsm AclFsm
for _, t := range acl.aclIps {
aclFsm.AclIpArr = append(aclFsm.AclIpArr, t)
}
var val []byte
if val, err = json.Marshal(aclFsm); err != nil {
log.LogErrorf("vol %v acl persist error %v", acl.vol.Name, err)
return
}
if err = acl.c.syncAclList(acl.vol, val); err != nil {
log.LogErrorf("vol %v acl persist syncAclList error %v", acl.vol.Name, err)
return
}
return
}
func (acl *AclManager) load(c *Cluster, val []byte) (err error) {
log.LogDebugf("vol %v acl load meta", acl.vol.Name)
acl.c = c
aclFsm := &AclFsm{}
if err = json.Unmarshal(val, aclFsm); err != nil {
log.LogErrorf("vol %v acl load %v", acl.vol.Name, err)
return
}
for _, info := range aclFsm.AclIpArr {
acl.aclIps[info.Ip] = info
log.LogDebugf("vol %v acl load %v", acl.vol.Name, info.Ip)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
syslog "log"
"strings"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
cfsProto "github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// LeaderInfo represents the leader's information
type LeaderInfo struct {
addr string // host:port
}
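// handleLeaderChange reacts to a raft leadership change. When this node becomes
// the leader it reloads metadata and kicks off heartbeat checks; otherwise it
// clears the local metadata cache and points the master client at the new leader.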
func (m *Server) handleLeaderChange(leader uint64) {
if leader == 0 {
log.LogWarnf("action[handleLeaderChange] but no leader")
if WarnMetrics != nil {
WarnMetrics.reset()
}
return
}
oldLeaderAddr := m.leaderInfo.addr
m.leaderInfo.addr = AddrDatabase[leader]
log.LogWarnf("action[handleLeaderChange] [%v] ", m.leaderInfo.addr)
m.reverseProxy = m.newReverseProxy()
if m.id == leader {
Warn(m.clusterName, fmt.Sprintf("clusterID[%v] leader is changed to %v",
m.clusterName, m.leaderInfo.addr))
if oldLeaderAddr != m.leaderInfo.addr {
m.cluster.checkPersistClusterValue()
m.loadMetadata()
m.cluster.metaReady = true
m.metaReady = true
}
m.cluster.checkDataNodeHeartbeat()
m.cluster.checkMetaNodeHeartbeat()
m.cluster.checkLcNodeHeartbeat()
m.cluster.followerReadManager.reSet()
} else {
Warn(m.clusterName, fmt.Sprintf("clusterID[%v] leader is changed to %v",
m.clusterName, m.leaderInfo.addr))
m.clearMetadata()
m.metaReady = false
m.cluster.metaReady = false
m.cluster.masterClient.AddNode(m.leaderInfo.addr)
m.cluster.masterClient.SetLeader(m.leaderInfo.addr)
if WarnMetrics != nil {
WarnMetrics.reset()
}
}
}
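// handlePeerChange applies a raft membership change by adding the peer to or
// removing it from the raft store, then records a cluster warning.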
func (m *Server) handlePeerChange(confChange *proto.ConfChange) (err error) {
var msg string
addr := string(confChange.Context)
switch confChange.Type {
case proto.ConfAddNode:
var arr []string
if arr = strings.Split(addr, colonSplit); len(arr) < 2 {
msg = fmt.Sprintf("action[handlePeerChange] clusterID[%v] nodeAddr[%v] is invalid", m.clusterName, addr)
break
}
m.raftStore.AddNodeWithPort(confChange.Peer.ID, arr[0], int(m.config.heartbeatPort), int(m.config.replicaPort))
AddrDatabase[confChange.Peer.ID] = string(confChange.Context)
msg = fmt.Sprintf("clusterID[%v] peerID:%v,nodeAddr[%v] has been add", m.clusterName, confChange.Peer.ID, addr)
case proto.ConfRemoveNode:
m.raftStore.DeleteNode(confChange.Peer.ID)
msg = fmt.Sprintf("clusterID[%v] peerID:%v,nodeAddr[%v] has been removed", m.clusterName, confChange.Peer.ID, addr)
default:
// do nothing
}
Warn(m.clusterName, msg)
return
}
func (m *Server) handleApplySnapshot() {
m.fsm.restore()
m.restoreIDAlloc()
return
}
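// handleRaftUserCmd applies user-defined raft commands. On non-leader nodes it
// updates the api limiter info pushed by the leader; other opcodes are only logged.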
func (m *Server) handleRaftUserCmd(opt uint32, key string, cmdMap map[string][]byte) (err error) {
log.LogInfof("action[handleRaftUserCmd] opt %v, key %v, map len %v", opt, key, len(cmdMap))
switch opt {
case opSyncPutFollowerApiLimiterInfo, opSyncPutApiLimiterInfo:
if m.cluster != nil && !m.partition.IsRaftLeader() {
m.cluster.apiLimiter.updateLimiterInfoFromLeader(cmdMap[key])
}
default:
log.LogErrorf("action[handleRaftUserCmd] opt %v not supported,key %v, map len %v", opt, key, len(cmdMap))
}
return nil
}
func (m *Server) restoreIDAlloc() {
m.cluster.idAlloc.restore()
}
// Load stored metadata into the memory
func (m *Server) loadMetadata() {
log.LogInfo("action[loadMetadata] begin")
syslog.Println("action[loadMetadata] begin")
m.clearMetadata()
m.restoreIDAlloc()
m.cluster.fsm.restore()
var err error
if err = m.cluster.loadClusterValue(); err != nil {
panic(err)
}
var loadDomain bool
if m.cluster.FaultDomain { // try load exclude
if loadDomain, err = m.cluster.loadZoneDomain(); err != nil {
log.LogInfof("action[putZoneDomain] err[%v]", err)
panic(err)
}
if err = m.cluster.loadNodeSetGrps(); err != nil {
panic(err)
}
if loadDomain {
// if the load succeeded, the domain was already initialized before this startup;
// start the grp manager; loading node sets can trigger building ns grps
m.cluster.domainManager.start()
}
}
if err = m.cluster.loadNodeSets(); err != nil {
panic(err)
}
if m.cluster.FaultDomain {
log.LogInfof("action[FaultDomain] set")
if !loadDomain { // first restart after the domain item was added
if err = m.cluster.putZoneDomain(true); err != nil {
log.LogInfof("action[putZoneDomain] err[%v]", err)
panic(err)
}
m.cluster.domainManager.start()
}
}
if err = m.cluster.loadDataNodes(); err != nil {
panic(err)
}
if err = m.cluster.loadMetaNodes(); err != nil {
panic(err)
}
if err = m.cluster.loadZoneValue(); err != nil {
panic(err)
}
if err = m.cluster.loadVols(); err != nil {
panic(err)
}
if err = m.cluster.loadMetaPartitions(); err != nil {
panic(err)
}
if err = m.cluster.loadDataPartitions(); err != nil {
panic(err)
}
if err = m.cluster.loadDecommissionDiskList(); err != nil {
panic(err)
}
if err = m.cluster.startDecommissionListTraverse(); err != nil {
panic(err)
}
log.LogInfo("action[loadMetadata] end")
log.LogInfo("action[loadUserInfo] begin")
if err = m.user.loadUserStore(); err != nil {
panic(err)
}
if err = m.user.loadAKStore(); err != nil {
panic(err)
}
if err = m.user.loadVolUsers(); err != nil {
panic(err)
}
log.LogInfo("action[loadUserInfo] end")
log.LogInfo("action[refreshUser] begin")
if err = m.refreshUser(); err != nil {
panic(err)
}
log.LogInfo("action[refreshUser] end")
log.LogInfo("action[loadApiLimiterInfo] begin")
if err = m.cluster.loadApiLimiterInfo(); err != nil {
panic(err)
}
log.LogInfo("action[loadApiLimiterInfo] end")
log.LogInfo("action[loadQuota] begin")
if err = m.cluster.loadQuota(); err != nil {
panic(err)
}
log.LogInfo("action[loadQuota] end")
log.LogInfo("action[loadLcConfs] begin")
if err = m.cluster.loadLcConfs(); err != nil {
panic(err)
}
log.LogInfo("action[loadLcConfs] end")
log.LogInfo("action[loadLcNodes] begin")
if err = m.cluster.loadLcNodes(); err != nil {
panic(err)
}
log.LogInfo("action[loadLcNodes] end")
syslog.Println("action[loadMetadata] end")
log.LogInfo("action[loadS3QoSInfo] begin")
if err = m.cluster.loadS3ApiQosInfo(); err != nil {
panic(err)
}
log.LogInfo("action[loadS3QoSInfo] end")
}
func (m *Server) clearMetadata() {
m.cluster.clearTopology()
m.cluster.clearDataNodes()
m.cluster.clearMetaNodes()
m.cluster.clearLcNodes()
m.cluster.clearVols()
if m.user != nil {
// the leader change event may arrive before m.user is initialized
m.user.clearUserStore()
m.user.clearAKStore()
m.user.clearVolUsers()
}
m.cluster.t = newTopology()
// m.cluster.apiLimiter.Clear()
}
func (m *Server) refreshUser() (err error) {
/* todo create user automatically
var userInfo *cfsProto.UserInfo
for volName, vol := range m.cluster.allVols() {
if _, err = m.user.getUserInfo(vol.Owner); err == cfsProto.ErrUserNotExists {
if len(vol.OSSAccessKey) > 0 && len(vol.OSSSecretKey) > 0 {
var param = cfsProto.UserCreateParam{
ID: vol.Owner,
Password: DefaultUserPassword,
AccessKey: vol.OSSAccessKey,
SecretKey: vol.OSSSecretKey,
Type: cfsProto.UserTypeNormal,
}
userInfo, err = m.user.createKey(¶m)
if err != nil && err != cfsProto.ErrDuplicateUserID && err != cfsProto.ErrDuplicateAccessKey {
return err
}
} else {
var param = cfsProto.UserCreateParam{
ID: vol.Owner,
Password: DefaultUserPassword,
Type: cfsProto.UserTypeNormal,
}
userInfo, err = m.user.createKey(¶m)
if err != nil && err != cfsProto.ErrDuplicateUserID {
return err
}
}
if err == nil && userInfo != nil {
if _, err = m.user.addOwnVol(userInfo.UserID, volName); err != nil {
return err
}
}
}
}*/
if _, err = m.user.getUserInfo(RootUserID); err != nil {
param := cfsProto.UserCreateParam{
ID: RootUserID,
Password: DefaultRootPasswd,
Type: cfsProto.UserTypeRoot,
}
if _, err = m.user.createKey(¶m); err != nil {
return err
}
}
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"strconv"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
type MasterQuotaManager struct {
MpQuotaInfoMap map[uint64][]*proto.QuotaReportInfo
IdQuotaInfoMap map[uint32]*proto.QuotaInfo
vol *Vol
c *Cluster
sync.RWMutex
}
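// createQuota validates the requested paths against every existing quota
// (rejecting duplicate root inodes, duplicate paths and nested directories),
// allocates a new quota id, persists the quota through raft and registers it in
// the in-memory map. A minimal request sketch; only fields referenced in this
// function are shown, anything else in proto.SetMasterQuotaReuqest is omitted:
//
//	req := &proto.SetMasterQuotaReuqest{
//		VolName:   "vol1",
//		MaxFiles:  1000000,
//		MaxBytes:  1 << 40,
//		PathInfos: []proto.QuotaPathInfo{{FullPath: "/data", RootInode: 1}},
//	}
//	quotaId, err := mqMgr.createQuota(req)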
func (mqMgr *MasterQuotaManager) createQuota(req *proto.SetMasterQuotaReuqest) (quotaId uint32, err error) {
mqMgr.Lock()
defer mqMgr.Unlock()
if len(mqMgr.IdQuotaInfoMap) >= gConfig.MaxQuotaNumPerVol {
err = errors.NewErrorf("the number of quota has reached the upper limit %v", len(mqMgr.IdQuotaInfoMap))
return
}
for _, quotaInfo := range mqMgr.IdQuotaInfoMap {
for _, pathInfo := range req.PathInfos {
for _, quotaPathInfo := range quotaInfo.PathInfos {
if pathInfo.RootInode == quotaPathInfo.RootInode {
err = errors.NewErrorf("path [%v] is the same as quotaId [%v]",
pathInfo.FullPath, quotaInfo.QuotaId)
return
}
if pathInfo.FullPath == quotaPathInfo.FullPath {
err = errors.NewErrorf("path [%v] is the same as quotaId [%v]",
pathInfo.FullPath, quotaInfo.QuotaId)
return
}
if proto.IsAncestor(pathInfo.FullPath, quotaPathInfo.FullPath) {
err = errors.NewErrorf("Nested directories found: %s and %s", pathInfo.FullPath, quotaPathInfo.FullPath)
return
}
if proto.IsAncestor(quotaPathInfo.FullPath, pathInfo.FullPath) {
err = errors.NewErrorf("Nested directories found: %s and %s", pathInfo.FullPath, quotaPathInfo.FullPath)
return
}
}
}
}
if quotaId, err = mqMgr.c.idAlloc.allocateQuotaID(); err != nil {
return
}
quotaInfo := &proto.QuotaInfo{
VolName: req.VolName,
QuotaId: quotaId,
CTime: time.Now().Unix(),
PathInfos: make([]proto.QuotaPathInfo, 0, len(req.PathInfos)),
MaxFiles: req.MaxFiles,
MaxBytes: req.MaxBytes,
}
for _, pathInfo := range req.PathInfos {
quotaInfo.PathInfos = append(quotaInfo.PathInfos, pathInfo)
}
var value []byte
if value, err = json.Marshal(quotaInfo); err != nil {
log.LogErrorf("create quota [%v] marsha1 fail [%v].", quotaInfo, err)
return
}
metadata := new(RaftCmd)
metadata.Op = opSyncSetQuota
metadata.K = quotaPrefix + strconv.FormatUint(mqMgr.vol.ID, 10) + keySeparator + strconv.FormatUint(uint64(quotaId), 10)
metadata.V = value
if err = mqMgr.c.submit(metadata); err != nil {
log.LogErrorf("create quota [%v] submit fail [%v].", quotaInfo, err)
return
}
// for _, pathInfo := range req.PathInfos {
// var inodes = make([]uint64, 0)
// inodes = append(inodes, pathInfo.RootInode)
// request := &proto.BatchSetMetaserverQuotaReuqest{
// PartitionId: pathInfo.PartitionId,
// Inodes: inodes,
// QuotaId: quotaId,
// }
// if err = mqMgr.setQuotaToMetaNode(request); err != nil {
// log.LogErrorf("create quota [%v] to metanode fail [%v].", quotaInfo, err)
// return
// }
// }
mqMgr.IdQuotaInfoMap[quotaId] = quotaInfo
log.LogInfof("create quota [%v] success.", quotaInfo)
return
}
func (mqMgr *MasterQuotaManager) updateQuota(req *proto.UpdateMasterQuotaReuqest) (err error) {
mqMgr.Lock()
defer mqMgr.Unlock()
quotaInfo, isFind := mqMgr.IdQuotaInfoMap[req.QuotaId]
if !isFind {
log.LogErrorf("vol [%v] quota quotaId [%v] is not exist.", mqMgr.vol.Name, req.QuotaId)
err = errors.New("quota is not exist.")
return
}
quotaInfo.MaxFiles = req.MaxFiles
quotaInfo.MaxBytes = req.MaxBytes
var value []byte
if value, err = json.Marshal(quotaInfo); err != nil {
log.LogErrorf("update quota [%v] marsha1 fail [%v].", quotaInfo, err)
return
}
metadata := new(RaftCmd)
metadata.Op = opSyncSetQuota
metadata.K = quotaPrefix + strconv.FormatUint(mqMgr.vol.ID, 10) + keySeparator + strconv.FormatUint(uint64(quotaInfo.QuotaId), 10)
metadata.V = value
if err = mqMgr.c.submit(metadata); err != nil {
log.LogErrorf("update quota [%v] submit fail [%v].", quotaInfo, err)
return
}
log.LogInfof("update quota [%v] success.", *quotaInfo)
return
}
func (mqMgr *MasterQuotaManager) listQuota() (resp *proto.ListMasterQuotaResponse) {
mqMgr.RLock()
defer mqMgr.RUnlock()
resp = &proto.ListMasterQuotaResponse{}
resp.Quotas = make([]*proto.QuotaInfo, 0)
for _, info := range mqMgr.IdQuotaInfoMap {
resp.Quotas = append(resp.Quotas, info)
}
return
}
func (mqMgr *MasterQuotaManager) getQuota(quotaId uint32) (quotaInfo *proto.QuotaInfo, err error) {
mqMgr.RLock()
defer mqMgr.RUnlock()
quotaInfo, isFind := mqMgr.IdQuotaInfoMap[quotaId]
if !isFind {
err = errors.New("quota is not exist.")
return nil, err
}
return quotaInfo, nil
}
func (mqMgr *MasterQuotaManager) deleteQuota(quotaId uint32) (err error) {
mqMgr.Lock()
defer mqMgr.Unlock()
quotaInfo, isFind := mqMgr.IdQuotaInfoMap[quotaId]
if !isFind {
log.LogErrorf("vol [%v] quota quotaId [%v] is not exist.", mqMgr.vol.Name, quotaId)
err = errors.New("quota is not exist.")
return
}
var value []byte
if value, err = json.Marshal(quotaInfo); err != nil {
log.LogErrorf("delete quota [%v] marsha1 fail [%v].", quotaInfo, err)
return
}
metadata := new(RaftCmd)
metadata.Op = opSyncDeleteQuota
metadata.K = quotaPrefix + strconv.FormatUint(mqMgr.vol.ID, 10) + keySeparator + strconv.FormatUint(uint64(quotaInfo.QuotaId), 10)
metadata.V = value
if err = mqMgr.c.submit(metadata); err != nil {
log.LogErrorf("delete quota [%v] submit fail [%v].", quotaInfo, err)
return
}
delete(mqMgr.IdQuotaInfoMap, quotaInfo.QuotaId)
log.LogInfof("deleteQuota: idmap len [%v]", len(mqMgr.IdQuotaInfoMap))
return
}
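// quotaUpdate consumes a meta partition report from the partition leader,
// re-aggregates per-quota usage across all reporting partitions and refreshes
// the LimitedFiles/LimitedBytes flags of every quota.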
func (mqMgr *MasterQuotaManager) quotaUpdate(report *proto.MetaPartitionReport) {
var (
quotaInfo = &proto.QuotaInfo{}
id uint32
)
mqMgr.Lock()
defer mqMgr.Unlock()
mpId := report.PartitionID
if !report.IsLeader {
return
}
mqMgr.MpQuotaInfoMap[mpId] = report.QuotaReportInfos
for _, quotaInfo = range mqMgr.IdQuotaInfoMap {
quotaInfo.UsedInfo.UsedFiles = 0
quotaInfo.UsedInfo.UsedBytes = 0
}
deleteQuotaIds := make(map[uint32]bool, 0)
for mpId, reportInfos := range mqMgr.MpQuotaInfoMap {
for _, info := range reportInfos {
if _, isFind := mqMgr.IdQuotaInfoMap[info.QuotaId]; !isFind {
deleteQuotaIds[info.QuotaId] = true
continue
}
log.LogDebugf("[quotaUpdate] mpId [%v] quotaId [%v] reportinfo [%v]", mpId, info.QuotaId, info.UsedInfo)
quotaInfo = mqMgr.IdQuotaInfoMap[info.QuotaId]
quotaInfo.UsedInfo.Add(&info.UsedInfo)
}
}
if len(deleteQuotaIds) != 0 {
log.LogWarnf("[quotaUpdate] quotaIds [%v] is delete", deleteQuotaIds)
}
for id, quotaInfo = range mqMgr.IdQuotaInfoMap {
if quotaInfo.IsOverQuotaFiles() {
quotaInfo.LimitedInfo.LimitedFiles = true
} else {
quotaInfo.LimitedInfo.LimitedFiles = false
}
if quotaInfo.IsOverQuotaBytes() {
quotaInfo.LimitedInfo.LimitedBytes = true
} else {
quotaInfo.LimitedInfo.LimitedBytes = false
}
log.LogDebugf("[quotaUpdate] quotaId [%v] quotaInfo [%v]", id, quotaInfo)
}
return
}
func (mqMgr *MasterQuotaManager) getQuotaHbInfos() (infos []*proto.QuotaHeartBeatInfo) {
mqMgr.RLock()
defer mqMgr.RUnlock()
for quotaId, quotaInfo := range mqMgr.IdQuotaInfoMap {
info := &proto.QuotaHeartBeatInfo{}
info.VolName = mqMgr.vol.Name
info.QuotaId = quotaId
info.LimitedInfo.LimitedFiles = quotaInfo.LimitedInfo.LimitedFiles
info.LimitedInfo.LimitedBytes = quotaInfo.LimitedInfo.LimitedBytes
info.Enable = mqMgr.vol.enableQuota
infos = append(infos, info)
log.LogDebugf("getQuotaHbInfos info %v", info)
}
return
}
func (mqMgr *MasterQuotaManager) HasQuota() bool {
mqMgr.RLock()
defer mqMgr.RUnlock()
if len(mqMgr.IdQuotaInfoMap) == 0 {
return false
}
return true
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/log"
)
// MetaNode defines the structure of a meta node
type MetaNode struct {
ID uint64
Addr string
DomainAddr string
IsActive bool
Sender *AdminTaskManager `graphql:"-"`
ZoneName string `json:"Zone"`
MaxMemAvailWeight uint64 `json:"MaxMemAvailWeight"`
Total uint64 `json:"TotalWeight"`
Used uint64 `json:"UsedWeight"`
Ratio float64
SelectCount uint64
Threshold float32
ReportTime time.Time
metaPartitionInfos []*proto.MetaPartitionReport
MetaPartitionCount int
NodeSetID uint64
sync.RWMutex `graphql:"-"`
ToBeOffline bool
PersistenceMetaPartitions []uint64
RdOnly bool
MigrateLock sync.RWMutex
CpuUtil atomicutil.Float64 `json:"-"`
}
func newMetaNode(addr, zoneName, clusterID string) (node *MetaNode) {
node = &MetaNode{
Addr: addr,
ZoneName: zoneName,
Sender: newAdminTaskManager(addr, clusterID),
}
node.CpuUtil.Store(0)
return
}
func (metaNode *MetaNode) clean() {
metaNode.Sender.exitCh <- struct{}{}
}
func (metaNode *MetaNode) GetID() uint64 {
metaNode.RLock()
defer metaNode.RUnlock()
return metaNode.ID
}
func (metaNode *MetaNode) GetAddr() string {
metaNode.RLock()
defer metaNode.RUnlock()
return metaNode.Addr
}
// SelectNodeForWrite implements the Node interface
func (metaNode *MetaNode) SelectNodeForWrite() {
metaNode.Lock()
defer metaNode.Unlock()
metaNode.SelectCount++
}
func (metaNode *MetaNode) isWritable() (ok bool) {
metaNode.RLock()
defer metaNode.RUnlock()
if metaNode.IsActive && metaNode.MaxMemAvailWeight > gConfig.metaNodeReservedMem &&
!metaNode.reachesThreshold() && metaNode.MetaPartitionCount < defaultMaxMetaPartitionCountOnEachNode &&
!metaNode.RdOnly {
ok = true
}
return
}
func (metaNode *MetaNode) setNodeActive() {
metaNode.Lock()
defer metaNode.Unlock()
metaNode.ReportTime = time.Now()
metaNode.IsActive = true
}
func (metaNode *MetaNode) updateMetric(resp *proto.MetaNodeHeartbeatResponse, threshold float32) {
metaNode.Lock()
defer metaNode.Unlock()
metaNode.DomainAddr = util.ParseIpAddrToDomainAddr(metaNode.Addr)
metaNode.metaPartitionInfos = resp.MetaPartitionReports
metaNode.MetaPartitionCount = len(metaNode.metaPartitionInfos)
metaNode.Total = resp.Total
metaNode.Used = resp.MemUsed
if resp.Total == 0 {
metaNode.Ratio = 0
} else {
metaNode.Ratio = float64(resp.MemUsed) / float64(resp.Total)
}
left := int64(resp.Total - resp.MemUsed)
if left < 0 {
metaNode.MaxMemAvailWeight = 0
} else {
metaNode.MaxMemAvailWeight = uint64(left)
}
metaNode.ZoneName = resp.ZoneName
metaNode.Threshold = threshold
}
func (metaNode *MetaNode) reachesThreshold() bool {
if metaNode.Threshold <= 0 {
metaNode.Threshold = defaultMetaPartitionMemUsageThreshold
}
return float32(float64(metaNode.Used)/float64(metaNode.Total)) > metaNode.Threshold
}
func (metaNode *MetaNode) createHeartbeatTask(masterAddr string, fileStatsEnable bool) (task *proto.AdminTask) {
request := &proto.HeartBeatRequest{
CurrTime: time.Now().Unix(),
MasterAddr: masterAddr,
}
request.FileStatsEnable = fileStatsEnable
task = proto.NewAdminTask(proto.OpMetaNodeHeartbeat, metaNode.Addr, request)
return
}
func (metaNode *MetaNode) createVersionTask(volume string, version uint64, op uint8, addr string, verList []*proto.VolVersionInfo) (task *proto.AdminTask) {
request := &proto.MultiVersionOpRequest{
VolumeID: volume,
VerSeq: version,
Op: op,
Addr: addr,
VolVerList: verList,
}
task = proto.NewAdminTask(proto.OpVersionOperation, metaNode.Addr, request)
return
}
func (metaNode *MetaNode) checkHeartbeat() {
metaNode.Lock()
defer metaNode.Unlock()
if time.Since(metaNode.ReportTime) > time.Second*time.Duration(defaultNodeTimeOutSec) {
metaNode.IsActive = false
}
}
// LeaderMetaNode defines the leader meta partitions on a meta node
type LeaderMetaNode struct {
addr string
metaPartitions []*MetaPartition
}
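// sortLeaderMetaNode orders meta nodes by their leader meta partition count in
// descending order (see Less), so that balanceLeader moves leaders off the
// busiest nodes first.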
type sortLeaderMetaNode struct {
nodes []*LeaderMetaNode
leaderCountM map[string]int
average int
mu sync.RWMutex
}
func (s *sortLeaderMetaNode) Less(i, j int) bool {
return len(s.nodes[i].metaPartitions) > len(s.nodes[j].metaPartitions)
}
func (s *sortLeaderMetaNode) Swap(i, j int) {
s.nodes[i], s.nodes[j] = s.nodes[j], s.nodes[i]
}
func (s *sortLeaderMetaNode) Len() int {
return len(s.nodes)
}
func (s *sortLeaderMetaNode) getLeaderCount(addr string) int {
s.mu.RLock()
defer s.mu.RUnlock()
return s.leaderCountM[addr]
}
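// changeLeader moves meta partition leaders away from node l until its leader
// count drops to the average: for each partition it picks a replica whose meta
// node currently holds fewer than (old leader count - 1) leaders and asks that
// replica to take over leadership, updating the per-node counters on success.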
func (s *sortLeaderMetaNode) changeLeader(l *LeaderMetaNode) {
for _, mp := range l.metaPartitions {
if count := s.getLeaderCount(l.addr); count <= s.average {
log.LogInfof("now leader count is[%d], average is[%d]", count, s.average)
break
}
// if the mp has no leader, we cannot change it; skip it
oldLeader, err := mp.getMetaReplicaLeader()
if err != nil {
log.LogErrorf("mp[%v] no leader, can not change leader err[%v]", mp, err)
continue
}
// pick, as the new leader, the replica whose meta node has a leader count smaller than (old leader count - 1)
addr := oldLeader.Addr
s.mu.RLock()
for i := 0; i < len(mp.Replicas); i++ {
if s.leaderCountM[mp.Replicas[i].Addr] < s.leaderCountM[oldLeader.Addr]-1 {
addr = mp.Replicas[i].Addr
}
}
s.mu.RUnlock()
if addr == oldLeader.Addr {
log.LogDebugf("newAddr:%s,oldAddr:%s is same", addr, oldLeader.Addr)
continue
}
// a failure to change one mp's leader does not affect the others
if err = mp.tryToChangeLeaderByHost(addr); err != nil {
log.LogErrorf("mp[%v] change to addr[%v] err[%v]", mp, addr, err)
continue
}
s.mu.Lock()
s.leaderCountM[addr]++
s.leaderCountM[oldLeader.Addr]--
s.mu.Unlock()
log.LogDebugf("mp[%v] oldLeader[%v,nowCount:%d] change to newLeader[%v,nowCount:%d] success", mp.PartitionID, oldLeader.Addr, s.leaderCountM[oldLeader.Addr], addr, s.leaderCountM[addr])
}
}
func (s *sortLeaderMetaNode) balanceLeader() {
for _, node := range s.nodes {
log.LogDebugf("node[%v] leader count is:%d,average:%d", node.addr, len(node.metaPartitions), s.average)
s.changeLeader(node)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// MetaReplica defines the replica of a meta partition
type MetaReplica struct {
Addr string
start uint64 // lower bound of the inode id
end uint64 // upper bound of the inode id
dataSize uint64
nodeID uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
FreeListLen uint64
ReportTime int64
Status int8 // unavailable, readOnly, readWrite
IsLeader bool
metaNode *MetaNode
}
// MetaPartition defines the structure of a meta partition
type MetaPartition struct {
PartitionID uint64
Start uint64
End uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
FreeListLen uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
Replicas []*MetaReplica
LeaderReportTime int64
ReplicaNum uint8
Status int8
IsRecover bool
volID uint64
volName string
Hosts []string
Peers []proto.Peer
OfflinePeerID uint64
MissNodes map[string]int64
LoadResponse []*proto.MetaPartitionLoadResponse
offlineMutex sync.RWMutex
uidInfo []*proto.UidReportSpaceInfo
EqualCheckPass bool
VerSeq uint64
heartBeatDone bool
sync.RWMutex
}
func newMetaReplica(start, end uint64, metaNode *MetaNode) (mr *MetaReplica) {
mr = &MetaReplica{start: start, end: end, nodeID: metaNode.ID, Addr: metaNode.Addr}
mr.metaNode = metaNode
mr.ReportTime = time.Now().Unix()
return
}
func newMetaPartition(partitionID, start, end uint64, replicaNum uint8, volName string, volID uint64, verSeq uint64) (mp *MetaPartition) {
mp = &MetaPartition{PartitionID: partitionID, Start: start, End: end, volName: volName, volID: volID}
mp.ReplicaNum = replicaNum
mp.Replicas = make([]*MetaReplica, 0)
mp.LeaderReportTime = time.Now().Unix()
mp.Status = proto.Unavailable
mp.MissNodes = make(map[string]int64, 0)
mp.Peers = make([]proto.Peer, 0)
mp.Hosts = make([]string, 0)
mp.VerSeq = verSeq
mp.LoadResponse = make([]*proto.MetaPartitionLoadResponse, 0)
mp.EqualCheckPass = true
return
}
func (mp *MetaPartition) setPeers(peers []proto.Peer) {
mp.Peers = peers
}
func (mp *MetaPartition) setHosts(hosts []string) {
mp.Hosts = hosts
}
func (mp *MetaPartition) hostsToString() (hosts string) {
return strings.Join(mp.Hosts, underlineSeparator)
}
func (mp *MetaPartition) addReplica(mr *MetaReplica) {
for _, m := range mp.Replicas {
if m.Addr == mr.Addr {
return
}
}
mp.Replicas = append(mp.Replicas, mr)
return
}
func (mp *MetaPartition) removeReplica(mr *MetaReplica) {
var newReplicas []*MetaReplica
for _, m := range mp.Replicas {
if m.Addr == mr.Addr {
continue
}
newReplicas = append(newReplicas, m)
}
mp.Replicas = newReplicas
return
}
func (mp *MetaPartition) removeReplicaByAddr(addr string) {
var newReplicas []*MetaReplica
for _, m := range mp.Replicas {
if m.Addr == addr {
continue
}
newReplicas = append(newReplicas, m)
}
mp.Replicas = newReplicas
return
}
func (mp *MetaPartition) updateInodeIDRangeForAllReplicas() {
for _, mr := range mp.Replicas {
mr.end = mp.End
}
}
// canSplit: the caller must hold the lock
func (mp *MetaPartition) canSplit(end uint64, metaPartitionInodeIdStep uint64, ignoreNoLeader bool) (err error) {
if end < mp.Start {
err = fmt.Errorf("end[%v] less than mp.start[%v]", end, mp.Start)
return
}
// overflow
if end > (defaultMaxMetaPartitionInodeID - metaPartitionInodeIdStep) {
msg := fmt.Sprintf("action[updateInodeIDRange] vol[%v] partitionID[%v] nextStart[%v] "+
"to prevent overflow ,not update end", mp.volName, mp.PartitionID, end)
log.LogWarn(msg)
err = fmt.Errorf(msg)
return
}
if end <= mp.MaxInodeID {
err = fmt.Errorf("next meta partition start must be larger than %v", mp.MaxInodeID)
return
}
if ignoreNoLeader {
return
}
if _, err = mp.getMetaReplicaLeader(); err != nil {
log.LogWarnf("action[updateInodeIDRange] vol[%v] id[%v] no leader", mp.volName, mp.PartitionID)
return
}
return
}
func (mp *MetaPartition) addUpdateMetaReplicaTask(c *Cluster) (err error) {
tasks := make([]*proto.AdminTask, 0)
t := mp.createTaskToUpdateMetaReplica(c.Name, mp.PartitionID, mp.End)
// if there is no leader, don't update the end
if t == nil {
err = proto.ErrNoLeader
return
}
tasks = append(tasks, t)
c.addMetaNodeTasks(tasks)
log.LogWarnf("action[addUpdateMetaReplicaTask] partitionID[%v] end[%v] success", mp.PartitionID, mp.End)
return
}
func (mp *MetaPartition) dataSize() uint64 {
maxSize := uint64(0)
for _, mr := range mp.Replicas {
if maxSize < mr.dataSize {
maxSize = mr.dataSize
}
}
return maxSize
}
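// checkEnd extends the inode range of the volume's last (max) meta partition to
// defaultMaxMetaPartitionInodeID, persists the change and notifies the leader
// replica; it is a no-op for non-max partitions or when the partition has no leader.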
func (mp *MetaPartition) checkEnd(c *Cluster, maxPartitionID uint64) {
if mp.PartitionID < maxPartitionID {
return
}
vol, err := c.getVol(mp.volName)
if err != nil {
log.LogWarnf("action[checkEnd] vol[%v] not exist", mp.volName)
return
}
vol.createMpMutex.Lock()
defer vol.createMpMutex.Unlock()
curMaxPartitionID := vol.maxPartitionID()
if mp.PartitionID != curMaxPartitionID {
log.LogWarnf("action[checkEnd] partition[%v] not max partition[%v]", mp.PartitionID, curMaxPartitionID)
return
}
mp.Lock()
defer mp.Unlock()
if _, err = mp.getMetaReplicaLeader(); err != nil {
log.LogWarnf("action[checkEnd] partition[%v] no leader", mp.PartitionID)
return
}
if mp.End != defaultMaxMetaPartitionInodeID {
oldEnd := mp.End
mp.End = defaultMaxMetaPartitionInodeID
if err := c.syncUpdateMetaPartition(mp); err != nil {
mp.End = oldEnd
log.LogErrorf("action[checkEnd] partitionID[%v] err[%v]", mp.PartitionID, err)
return
}
if err = mp.addUpdateMetaReplicaTask(c); err != nil {
mp.End = oldEnd
}
}
log.LogDebugf("action[checkEnd] partitionID[%v] end[%v]", mp.PartitionID, mp.End)
}
func (mp *MetaPartition) getMetaReplica(addr string) (mr *MetaReplica, err error) {
for _, mr = range mp.Replicas {
if mr.Addr == addr {
return
}
}
return nil, metaReplicaNotFound(addr)
}
func (mp *MetaPartition) removeMissingReplica(addr string) {
if _, ok := mp.MissNodes[addr]; ok {
delete(mp.MissNodes, addr)
}
}
func (mp *MetaPartition) isLeaderExist() bool {
mp.RLock()
defer mp.RUnlock()
for _, mr := range mp.Replicas {
if mr.IsLeader {
return true
}
}
return false
}
func (mp *MetaPartition) checkLeader(clusterID string) {
mp.Lock()
defer mp.Unlock()
for _, mr := range mp.Replicas {
if !mr.isActive() {
mr.IsLeader = false
}
}
var report bool
if _, err := mp.getMetaReplicaLeader(); err != nil {
report = true
}
if WarnMetrics != nil {
WarnMetrics.WarnMpNoLeader(clusterID, mp.PartitionID, report)
}
return
}
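// checkStatus recomputes the partition status from its live replicas, marks
// overloaded non-max partitions read-only and reports whether the max partition
// should be split.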
func (mp *MetaPartition) checkStatus(clusterID string, writeLog bool, replicaNum int, maxPartitionID uint64, metaPartitionInodeIdStep uint64, forbiddenVol bool) (doSplit bool) {
mp.Lock()
defer mp.Unlock()
mp.checkReplicas()
liveReplicas := mp.getLiveReplicas()
if len(liveReplicas) <= replicaNum/2 {
mp.Status = proto.Unavailable
} else {
mr, err := mp.getMetaReplicaLeader()
if err != nil {
mp.Status = proto.Unavailable
log.LogErrorf("[checkStatus] mp %v getMetaReplicaLeader err:%v", mp.PartitionID, err)
}
mp.Status = mr.Status
for _, replica := range liveReplicas {
if replica.Status == proto.ReadOnly {
mp.Status = proto.ReadOnly
}
if mr.metaNode == nil {
continue
}
if !mr.metaNode.reachesThreshold() && mp.InodeCount < metaPartitionInodeIdStep {
continue
}
if mp.PartitionID == maxPartitionID {
log.LogInfof("split[checkStatus] need split,id:%v,status:%v,replicaNum:%v,InodeCount:%v", mp.PartitionID, mp.Status, mp.ReplicaNum, mp.InodeCount)
doSplit = true
} else {
if mr.metaNode.reachesThreshold() || mp.End-mp.MaxInodeID > 2*metaPartitionInodeIdStep {
log.LogInfof("split[checkStatus],change state,id:%v,status:%v,replicaNum:%v,replicas:%v,persistenceHosts:%v, inodeCount:%v, MaxInodeID:%v, start:%v, end:%v",
mp.PartitionID, mp.Status, mp.ReplicaNum, len(liveReplicas), mp.Hosts, mp.InodeCount, mp.MaxInodeID, mp.Start, mp.End)
mp.Status = proto.ReadOnly
}
}
}
}
if mp.PartitionID >= maxPartitionID && mp.Status == proto.ReadOnly && !forbiddenVol {
mp.Status = proto.ReadWrite
}
if writeLog && len(liveReplicas) != int(mp.ReplicaNum) {
msg := fmt.Sprintf("action[checkMPStatus],id:%v,status:%v,replicaNum:%v,replicas:%v,persistenceHosts:%v",
mp.PartitionID, mp.Status, mp.ReplicaNum, len(liveReplicas), mp.Hosts)
log.LogInfo(msg)
Warn(clusterID, msg)
}
return
}
func (mp *MetaPartition) getMetaReplicaLeader() (mr *MetaReplica, err error) {
for _, mr = range mp.Replicas {
if mr.IsLeader {
return
}
}
err = proto.ErrNoLeader
return
}
func (mp *MetaPartition) checkReplicaNum(c *Cluster, volName string, replicaNum uint8) {
mp.RLock()
defer mp.RUnlock()
if mp.ReplicaNum != replicaNum {
msg := fmt.Sprintf("FIX MetaPartition replicaNum clusterID[%v] vol[%v] replica num[%v],current num[%v]",
c.Name, volName, replicaNum, mp.ReplicaNum)
Warn(c.Name, msg)
}
}
func (mp *MetaPartition) removeIllegalReplica() (excessAddr string, t *proto.AdminTask, err error) {
mp.RLock()
defer mp.RUnlock()
for _, mr := range mp.Replicas {
if !contains(mp.Hosts, mr.Addr) {
t = mr.createTaskToDeleteReplica(mp.PartitionID)
err = proto.ErrIllegalMetaReplica
break
}
}
return
}
func (mp *MetaPartition) missingReplicaAddrs() (lackAddrs []string) {
mp.RLock()
defer mp.RUnlock()
var liveReplicas []string
for _, mr := range mp.Replicas {
liveReplicas = append(liveReplicas, mr.Addr)
}
for _, host := range mp.Hosts {
if !contains(liveReplicas, host) {
lackAddrs = append(lackAddrs, host)
break
}
}
return
}
func (mp *MetaPartition) updateMetaPartition(mgr *proto.MetaPartitionReport, metaNode *MetaNode) {
if !contains(mp.Hosts, metaNode.Addr) {
return
}
mp.Lock()
defer mp.Unlock()
mr, err := mp.getMetaReplica(metaNode.Addr)
if err != nil {
mr = newMetaReplica(mp.Start, mp.End, metaNode)
mp.addReplica(mr)
}
mr.updateMetric(mgr)
if mr.IsLeader {
mp.LeaderReportTime = time.Now().Unix()
}
mp.setMaxInodeID()
mp.setInodeCount()
mp.setDentryCount()
mp.setFreeListLen()
mp.SetTxCnt()
mp.removeMissingReplica(metaNode.Addr)
mp.setUidInfo(mgr)
mp.setHeartBeatDone()
}
func (mp *MetaPartition) canBeOffline(nodeAddr string, replicaNum int) (err error) {
liveReplicas := mp.getLiveReplicas()
if len(liveReplicas) < int(mp.ReplicaNum/2+1) {
err = proto.ErrNoEnoughReplica
return
}
liveAddrs := mp.getLiveReplicasAddr(liveReplicas)
if len(liveReplicas) == (replicaNum/2+1) && contains(liveAddrs, nodeAddr) {
err = fmt.Errorf("live replicas num will be less than majority after offline nodeAddr: %v", nodeAddr)
return
}
return
}
// hasMissingOneReplica returns an error when addr is one of the replicas and the partition is already short of a replica.
func (mp *MetaPartition) hasMissingOneReplica(addr string, replicaNum int) (err error) {
inReplicas := false
for _, rep := range mp.Replicas {
if rep.Addr == addr {
inReplicas = true
break
}
}
hostNum := len(mp.Replicas)
if hostNum <= replicaNum-1 && inReplicas {
log.LogError(fmt.Sprintf("action[%v],partitionID:%v,err:%v",
"hasMissingOneReplica", mp.PartitionID, proto.ErrHasOneMissingReplica))
err = proto.ErrHasOneMissingReplica
}
return
}
func (mp *MetaPartition) getLiveReplicasAddr(liveReplicas []*MetaReplica) (addrs []string) {
addrs = make([]string, 0)
for _, mr := range liveReplicas {
addrs = append(addrs, mr.Addr)
}
return
}
func (mp *MetaPartition) getLiveReplicas() (liveReplicas []*MetaReplica) {
liveReplicas = make([]*MetaReplica, 0)
for _, mr := range mp.Replicas {
if mr.isActive() {
liveReplicas = append(liveReplicas, mr)
}
}
return
}
func (mp *MetaPartition) checkReplicas() {
for _, mr := range mp.Replicas {
if !mr.isActive() {
mr.Status = proto.Unavailable
}
}
return
}
func (mp *MetaPartition) persistToRocksDB(action, volName string, newHosts []string, newPeers []proto.Peer, c *Cluster) (err error) {
oldHosts := make([]string, len(mp.Hosts))
copy(oldHosts, mp.Hosts)
oldPeers := make([]proto.Peer, len(mp.Peers))
copy(oldPeers, mp.Peers)
mp.Hosts = newHosts
mp.Peers = newPeers
if err = c.syncUpdateMetaPartition(mp); err != nil {
mp.Hosts = oldHosts
mp.Peers = oldPeers
log.LogWarnf("action[%v_persist] failed,vol[%v] partitionID:%v old hosts:%v new hosts:%v oldPeers:%v newPeers:%v",
action, volName, mp.PartitionID, mp.Hosts, newHosts, mp.Peers, newPeers)
return
}
log.LogWarnf("action[%v_persist] success,vol[%v] partitionID:%v old hosts:%v new hosts:%v oldPeers:%v newPeers:%v ",
action, volName, mp.PartitionID, oldHosts, mp.Hosts, oldPeers, mp.Peers)
return
}
func (mp *MetaPartition) getActiveAddrs() (liveAddrs []string) {
liveAddrs = make([]string, 0)
for _, mr := range mp.Replicas {
if mr.isActive() {
liveAddrs = append(liveAddrs, mr.Addr)
}
}
return liveAddrs
}
func (mp *MetaPartition) isMissingReplica(addr string) bool {
return !contains(mp.getActiveAddrs(), addr)
}
func (mp *MetaPartition) shouldReportMissingReplica(addr string, interval int64) (isWarn bool) {
lastWarningTime, ok := mp.MissNodes[addr]
if !ok {
isWarn = true
mp.MissNodes[addr] = time.Now().Unix()
} else if (time.Now().Unix() - lastWarningTime) > interval {
isWarn = true
mp.MissNodes[addr] = time.Now().Unix()
}
return isWarn
// return false
}
func (mp *MetaPartition) reportMissingReplicas(clusterID, leaderAddr string, seconds, interval int64) {
mp.Lock()
defer mp.Unlock()
for _, replica := range mp.Replicas {
// reduce the alarm frequency
if contains(mp.Hosts, replica.Addr) && replica.isMissing() {
if mp.shouldReportMissingReplica(replica.Addr, interval) {
metaNode := replica.metaNode
var lastReportTime time.Time
isActive := true
if metaNode != nil {
lastReportTime = metaNode.ReportTime
isActive = metaNode.IsActive
}
msg := fmt.Sprintf("action[reportMissingReplicas], clusterID[%v] volName[%v] partition:%v on node:%v "+
"miss time > :%v vlocLastRepostTime:%v dnodeLastReportTime:%v nodeisActive:%v",
clusterID, mp.volName, mp.PartitionID, replica.Addr, seconds, replica.ReportTime, lastReportTime, isActive)
Warn(clusterID, msg)
// msg = fmt.Sprintf("decommissionMetaPartitionURL is http://%v/dataPartition/decommission?id=%v&addr=%v", leaderAddr, mp.PartitionID, replica.Addr)
// Warn(clusterID, msg)
if WarnMetrics != nil {
WarnMetrics.WarnMissingMp(clusterID, replica.Addr, mp.PartitionID, true)
}
}
} else {
if WarnMetrics != nil {
WarnMetrics.WarnMissingMp(clusterID, replica.Addr, mp.PartitionID, false)
}
}
}
if WarnMetrics != nil {
WarnMetrics.CleanObsoleteMpMissing(clusterID, mp)
}
for _, addr := range mp.Hosts {
if mp.isMissingReplica(addr) && mp.shouldReportMissingReplica(addr, interval) {
msg := fmt.Sprintf("action[reportMissingReplicas],clusterID[%v] volName[%v] partition:%v on node:%v "+
"miss time > %v ",
clusterID, mp.volName, mp.PartitionID, addr, defaultMetaPartitionTimeOutSec)
Warn(clusterID, msg)
msg = fmt.Sprintf("decommissionMetaPartitionURL is http://%v/dataPartition/decommission?id=%v&addr=%v", leaderAddr, mp.PartitionID, addr)
Warn(clusterID, msg)
}
}
}
func (mp *MetaPartition) replicaCreationTasks(clusterID, volName string) (tasks []*proto.AdminTask) {
var msg string
tasks = make([]*proto.AdminTask, 0)
if addr, _, err := mp.removeIllegalReplica(); err != nil {
msg = fmt.Sprintf("action[%v],clusterID[%v] metaPartition:%v excess replication"+
" on :%v err:%v persistenceHosts:%v",
deleteIllegalReplicaErr, clusterID, mp.PartitionID, addr, err.Error(), mp.Hosts)
log.LogWarn(msg)
}
if addrs := mp.missingReplicaAddrs(); addrs != nil {
msg = fmt.Sprintf("action[missingReplicaAddrs],clusterID[%v] metaPartition:%v lack replication"+
" on :%v Hosts:%v",
clusterID, mp.PartitionID, addrs, mp.Hosts)
Warn(clusterID, msg)
}
return
}
func (mp *MetaPartition) buildNewMetaPartitionTasks(specifyAddrs []string, peers []proto.Peer, volName string) (tasks []*proto.AdminTask) {
tasks = make([]*proto.AdminTask, 0)
hosts := make([]string, 0)
req := &proto.CreateMetaPartitionRequest{
Start: mp.Start,
End: mp.End,
PartitionID: mp.PartitionID,
Members: peers,
VolName: volName,
VerSeq: mp.VerSeq,
}
if specifyAddrs == nil {
hosts = mp.Hosts
} else {
hosts = specifyAddrs
}
for _, addr := range hosts {
t := proto.NewAdminTask(proto.OpCreateMetaPartition, addr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
tasks = append(tasks, t)
}
return
}
func (mp *MetaPartition) tryToChangeLeader(c *Cluster, metaNode *MetaNode) (err error) {
task, err := mp.createTaskToTryToChangeLeader(metaNode.Addr)
if err != nil {
return
}
if _, err = metaNode.Sender.syncSendAdminTask(task); err != nil {
return
}
return
}
func (mp *MetaPartition) tryToChangeLeaderByHost(host string) (err error) {
var metaNode *MetaNode
for _, r := range mp.Replicas {
if host == r.Addr {
metaNode = r.metaNode
break
}
}
if metaNode == nil {
return fmt.Errorf("host not found[%v]", host)
}
task, err := mp.createTaskToTryToChangeLeader(host)
if err != nil {
return
}
if _, err = metaNode.Sender.syncSendAdminTask(task); err != nil {
return
}
return
}
func (mp *MetaPartition) createTaskToTryToChangeLeader(addr string) (task *proto.AdminTask, err error) {
task = proto.NewAdminTask(proto.OpMetaPartitionTryToLeader, addr, nil)
resetMetaPartitionTaskID(task, mp.PartitionID)
return
}
func (mp *MetaPartition) createTaskToCreateReplica(host string) (t *proto.AdminTask, err error) {
req := &proto.CreateMetaPartitionRequest{
Start: mp.Start,
End: mp.End,
PartitionID: mp.PartitionID,
Members: mp.Peers,
VolName: mp.volName,
VerSeq: mp.VerSeq,
}
t = proto.NewAdminTask(proto.OpCreateMetaPartition, host, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func (mp *MetaPartition) createTaskToAddRaftMember(addPeer proto.Peer, leaderAddr string) (t *proto.AdminTask, err error) {
req := &proto.AddMetaPartitionRaftMemberRequest{PartitionId: mp.PartitionID, AddPeer: addPeer}
t = proto.NewAdminTask(proto.OpAddMetaPartitionRaftMember, leaderAddr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func (mp *MetaPartition) createTaskToRemoveRaftMember(removePeer proto.Peer) (t *proto.AdminTask, err error) {
mr, err := mp.getMetaReplicaLeader()
if err != nil {
return nil, errors.NewError(err)
}
req := &proto.RemoveMetaPartitionRaftMemberRequest{PartitionId: mp.PartitionID, RemovePeer: removePeer}
t = proto.NewAdminTask(proto.OpRemoveMetaPartitionRaftMember, mr.Addr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func (mp *MetaPartition) createTaskToDecommissionReplica(volName string, removePeer proto.Peer, addPeer proto.Peer) (t *proto.AdminTask, err error) {
mr, err := mp.getMetaReplicaLeader()
if err != nil {
return nil, errors.NewError(err)
}
req := &proto.MetaPartitionDecommissionRequest{PartitionID: mp.PartitionID, VolName: volName, RemovePeer: removePeer, AddPeer: addPeer}
t = proto.NewAdminTask(proto.OpDecommissionMetaPartition, mr.Addr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func resetMetaPartitionTaskID(t *proto.AdminTask, partitionID uint64) {
t.ID = fmt.Sprintf("%v_pid[%v]", t.ID, partitionID)
t.PartitionID = partitionID
}
func (mp *MetaPartition) createTaskToUpdateMetaReplica(clusterID string, partitionID uint64, end uint64) (t *proto.AdminTask) {
mr, err := mp.getMetaReplicaLeader()
if err != nil {
msg := fmt.Sprintf("action[createTaskToUpdateMetaReplica] clusterID[%v] meta partition %v no leader",
clusterID, mp.PartitionID)
Warn(clusterID, msg)
return
}
req := &proto.UpdateMetaPartitionRequest{PartitionID: partitionID, End: end, VolName: mp.volName}
t = proto.NewAdminTask(proto.OpUpdateMetaPartition, mr.Addr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func (mr *MetaReplica) createTaskToDeleteReplica(partitionID uint64) (t *proto.AdminTask) {
req := &proto.DeleteMetaPartitionRequest{PartitionID: partitionID}
t = proto.NewAdminTask(proto.OpDeleteMetaPartition, mr.Addr, req)
resetMetaPartitionTaskID(t, partitionID)
return
}
func (mr *MetaReplica) createTaskToLoadMetaPartition(partitionID uint64) (t *proto.AdminTask) {
req := &proto.MetaPartitionLoadRequest{PartitionID: partitionID}
t = proto.NewAdminTask(proto.OpLoadMetaPartition, mr.Addr, req)
resetMetaPartitionTaskID(t, partitionID)
return
}
func (mr *MetaReplica) isMissing() (miss bool) {
return time.Now().Unix()-mr.ReportTime > defaultMetaPartitionTimeOutSec
}
func (mr *MetaReplica) isActive() (active bool) {
return mr.metaNode.IsActive && mr.Status != proto.Unavailable &&
time.Now().Unix()-mr.ReportTime < defaultMetaPartitionTimeOutSec
}
func (mr *MetaReplica) setLastReportTime() {
mr.ReportTime = time.Now().Unix()
}
func (mr *MetaReplica) updateMetric(mgr *proto.MetaPartitionReport) {
mr.Status = (int8)(mgr.Status)
mr.IsLeader = mgr.IsLeader
mr.MaxInodeID = mgr.MaxInodeID
mr.InodeCount = mgr.InodeCnt
mr.DentryCount = mgr.DentryCnt
mr.TxCnt = mgr.TxCnt
mr.TxRbInoCnt = mgr.TxRbInoCnt
mr.TxRbDenCnt = mgr.TxRbDenCnt
mr.FreeListLen = mgr.FreeListLen
mr.dataSize = mgr.Size
mr.setLastReportTime()
if mr.metaNode.RdOnly && mr.Status == proto.ReadWrite {
mr.Status = proto.ReadOnly
}
}
func (mp *MetaPartition) afterCreation(nodeAddr string, c *Cluster) (err error) {
metaNode, err := c.metaNode(nodeAddr)
if err != nil {
return err
}
mr := newMetaReplica(mp.Start, mp.End, metaNode)
mr.Status = proto.ReadWrite
mr.ReportTime = time.Now().Unix()
mp.addReplica(mr)
mp.removeMissingReplica(mr.Addr)
return
}
func (mp *MetaPartition) addOrReplaceLoadResponse(response *proto.MetaPartitionLoadResponse) {
mp.Lock()
defer mp.Unlock()
loadResponse := make([]*proto.MetaPartitionLoadResponse, 0)
for _, lr := range mp.LoadResponse {
if lr.Addr == response.Addr {
continue
}
loadResponse = append(loadResponse, lr)
}
loadResponse = append(loadResponse, response)
mp.LoadResponse = loadResponse
}
func (mp *MetaPartition) getMinusOfMaxInodeID() (minus float64) {
mp.RLock()
defer mp.RUnlock()
var sentry float64
for index, replica := range mp.Replicas {
if index == 0 {
sentry = float64(replica.MaxInodeID)
continue
}
diff := math.Abs(float64(replica.MaxInodeID) - sentry)
if diff > minus {
minus = diff
}
}
return
}
func (mp *MetaPartition) activeMaxInodeSimilar() bool {
mp.RLock()
defer mp.RUnlock()
minus := float64(0)
var sentry float64
replicas := mp.getLiveReplicas()
for index, replica := range replicas {
if index == 0 {
sentry = float64(replica.MaxInodeID)
continue
}
diff := math.Abs(float64(replica.MaxInodeID) - sentry)
if diff > minus {
minus = diff
}
}
return minus < defaultMinusOfMaxInodeID
}
func (mp *MetaPartition) setUidInfo(mgr *proto.MetaPartitionReport) {
if !mgr.IsLeader {
return
}
mp.uidInfo = mgr.UidInfo
}
func (mp *MetaPartition) setMaxInodeID() {
var maxUsed uint64
for _, r := range mp.Replicas {
if r.MaxInodeID > maxUsed {
maxUsed = r.MaxInodeID
}
}
mp.MaxInodeID = maxUsed
}
// The caller should hold mp's lock when calling this.
func (mp *MetaPartition) setHeartBeatDone() {
if len(mp.Replicas) == int(mp.ReplicaNum) {
mp.heartBeatDone = true
}
}
func (mp *MetaPartition) setInodeCount() {
var inodeCount uint64
for _, r := range mp.Replicas {
if r.InodeCount > inodeCount {
inodeCount = r.InodeCount
}
}
mp.InodeCount = inodeCount
}
func (mp *MetaPartition) setDentryCount() {
var dentryCount uint64
for _, r := range mp.Replicas {
if r.DentryCount > dentryCount {
dentryCount = r.DentryCount
}
}
mp.DentryCount = dentryCount
}
func (mp *MetaPartition) setFreeListLen() {
var freeListLen uint64
for _, r := range mp.Replicas {
if r.FreeListLen > freeListLen {
freeListLen = r.FreeListLen
}
}
mp.FreeListLen = freeListLen
}
func (mp *MetaPartition) SetTxCnt() {
var txCnt, rbInoCnt, rbDenCnt uint64
for _, r := range mp.Replicas {
if r.TxCnt > txCnt {
txCnt = r.TxCnt
}
if r.TxRbInoCnt > rbInoCnt {
rbInoCnt = r.TxRbInoCnt
}
if r.TxRbDenCnt > rbDenCnt {
rbDenCnt = r.TxRbDenCnt
}
}
mp.TxCnt, mp.TxRbInoCnt, mp.TxRbDenCnt = txCnt, rbInoCnt, rbDenCnt
}
func (mp *MetaPartition) getAllNodeSets() (nodeSets []uint64) {
mp.RLock()
defer mp.RUnlock()
nodeSets = make([]uint64, 0)
for _, mr := range mp.Replicas {
if mr.metaNode == nil {
continue
}
if !containsID(nodeSets, mr.metaNode.NodeSetID) {
nodeSets = append(nodeSets, mr.metaNode.NodeSetID)
}
}
return
}
func (mp *MetaPartition) getLiveZones(offlineAddr string) (zones []string) {
mp.RLock()
defer mp.RUnlock()
for _, mr := range mp.Replicas {
if mr.metaNode == nil {
continue
}
if mr.Addr == offlineAddr {
continue
}
zones = append(zones, mr.metaNode.ZoneName)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"time"
"github.com/cubefs/cubefs/util/log"
)
func (c *Cluster) scheduleToLoadMetaPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
if c.vols != nil {
c.checkLoadMetaPartitions()
}
}
time.Sleep(2 * time.Second * defaultIntervalToCheckDataPartition)
}
}()
}
func (c *Cluster) checkLoadMetaPartitions() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkDiskRecoveryProgress occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkDiskRecoveryProgress occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
mps := vol.cloneMetaPartitionMap()
for _, mp := range mps {
c.doLoadMetaPartition(mp)
}
}
}
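// checkSnapshot compares the replicas' load responses (apply id, inode and
// dentry counts) and records in EqualCheckPass whether the cross-replica
// consistency check passed.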
func (mp *MetaPartition) checkSnapshot(c *Cluster) {
if len(mp.LoadResponse) == 0 {
return
}
if !mp.doCompare() {
return
}
if !mp.isSameApplyID() {
return
}
ckInode := mp.checkInodeCount(c)
ckDentry := mp.checkDentryCount(c)
if ckInode && ckDentry {
mp.EqualCheckPass = true
} else {
mp.EqualCheckPass = false
}
}
func (mp *MetaPartition) doCompare() bool {
for _, lr := range mp.LoadResponse {
if !lr.DoCompare {
return false
}
}
return true
}
func (mp *MetaPartition) isSameApplyID() bool {
rst := true
applyID := mp.LoadResponse[0].ApplyID
for _, loadResponse := range mp.LoadResponse {
if applyID != loadResponse.ApplyID {
rst = false
}
}
return rst
}
func (mp *MetaPartition) checkInodeCount(c *Cluster) (isEqual bool) {
isEqual = true
maxInode := mp.LoadResponse[0].MaxInode
maxInodeCount := mp.LoadResponse[0].InodeCount
inodeEqual := true
maxInodeEqual := true
if mp.IsRecover {
return
}
for _, loadResponse := range mp.LoadResponse {
diff := math.Abs(float64(loadResponse.MaxInode) - float64(maxInode))
if diff > defaultRangeOfCountDifferencesAllowed {
isEqual = false
inodeEqual = false
break
}
diff = math.Abs(float64(loadResponse.InodeCount) - float64(maxInodeCount))
if diff > defaultRangeOfCountDifferencesAllowed {
isEqual = false
maxInodeEqual = false
break
}
}
if !isEqual {
msg := fmt.Sprintf("inode count is not equal,vol[%v],mpID[%v],", mp.volName, mp.PartitionID)
for _, lr := range mp.LoadResponse {
lrMsg := fmt.Sprintf(msg+lr.Addr, "applyId[%d],committedId[%d],maxInode[%d],InodeCnt[%d]", lr.ApplyID, lr.CommittedID, lr.MaxInode, lr.InodeCount)
Warn(c.Name, lrMsg)
}
if !maxInodeEqual {
c.inodeCountNotEqualMP.Store(mp.PartitionID, mp)
}
if !inodeEqual {
c.maxInodeNotEqualMP.Store(mp.PartitionID, mp)
}
} else {
if _, ok := c.inodeCountNotEqualMP.Load(mp.PartitionID); ok {
c.inodeCountNotEqualMP.Delete(mp.PartitionID)
}
if _, ok := c.maxInodeNotEqualMP.Load(mp.PartitionID); ok {
c.maxInodeNotEqualMP.Delete(mp.PartitionID)
}
}
return
}
func (mp *MetaPartition) checkDentryCount(c *Cluster) (isEqual bool) {
isEqual = true
if mp.IsRecover {
return
}
dentryCount := mp.LoadResponse[0].DentryCount
for _, loadResponse := range mp.LoadResponse {
diff := math.Abs(float64(loadResponse.DentryCount) - float64(dentryCount))
if diff > defaultRangeOfCountDifferencesAllowed {
isEqual = false
}
}
if !isEqual {
msg := fmt.Sprintf("dentry count is not equal,vol[%v],mpID[%v],", mp.volName, mp.PartitionID)
for _, lr := range mp.LoadResponse {
lrMsg := fmt.Sprintf(msg+lr.Addr, "applyId[%d],committedId[%d],dentryCount[%d]", lr.ApplyID, lr.CommittedID, lr.DentryCount)
Warn(c.Name, lrMsg)
}
c.dentryCountNotEqualMP.Store(mp.PartitionID, mp)
} else {
if _, ok := c.dentryCountNotEqualMP.Load(mp.PartitionID); ok {
c.dentryCountNotEqualMP.Delete(mp.PartitionID)
}
}
return
}
func (c *Cluster) scheduleToCheckMetaPartitionRecoveryProgress() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
if c.vols != nil {
c.checkMetaPartitionRecoveryProgress()
}
}
time.Sleep(time.Second * defaultIntervalToCheckDataPartition)
}
}()
}
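// checkMetaPartitionRecoveryProgress walks the bad meta partition map, drops the
// partitions whose replicas have caught up (replica count restored and max inode
// ids converged) and keeps the ones that are still recovering.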
func (c *Cluster) checkMetaPartitionRecoveryProgress() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkMetaPartitionRecoveryProgress occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkMetaPartitionRecoveryProgress occurred panic")
}
}()
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
c.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
badMetaPartitionIds := value.([]uint64)
newBadMpIds := make([]uint64, 0)
for _, partitionID := range badMetaPartitionIds {
partition, err := c.getMetaPartitionByID(partitionID)
if err != nil {
Warn(c.Name, fmt.Sprintf("checkMetaPartitionRecoveryProgress clusterID[%v], partitionID[%v] is not exist", c.Name, partitionID))
continue
}
vol, err := c.getVol(partition.volName)
if err != nil {
Warn(c.Name, fmt.Sprintf("checkMetaPartitionRecoveryProgress clusterID[%v],vol[%v] partitionID[%v]is not exist",
c.Name, partition.volName, partitionID))
continue
}
if len(partition.Replicas) == 0 || len(partition.Replicas) < int(vol.mpReplicaNum) {
newBadMpIds = append(newBadMpIds, partitionID)
continue
}
if partition.getMinusOfMaxInodeID() < defaultMinusOfMaxInodeID {
partition.IsRecover = false
partition.RLock()
c.syncUpdateMetaPartition(partition)
partition.RUnlock()
Warn(c.Name, fmt.Sprintf("checkMetaPartitionRecoveryProgress clusterID[%v],vol[%v] partitionID[%v] has recovered success",
c.Name, partition.volName, partitionID))
} else {
newBadMpIds = append(newBadMpIds, partitionID)
}
}
if len(newBadMpIds) == 0 {
Warn(c.Name, fmt.Sprintf("checkMetaPartitionRecoveryProgress clusterID[%v],node[%v] has recovered success", c.Name, key))
c.BadMetaPartitionIds.Delete(key)
} else {
c.BadMetaPartitionIds.Store(key, newBadMpIds)
log.LogInfof("checkMetaPartitionRecoveryProgress BadMetaPartitionIds there is still (%d) mp in recover, addr (%s)", len(newBadMpIds), key)
}
return true
})
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"io"
"strconv"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
raftstore "github.com/cubefs/cubefs/raftstore/raftstore_db"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
const (
applied = "applied"
)
type raftLeaderChangeHandler func(leader uint64)
type raftPeerChangeHandler func(confChange *proto.ConfChange) (err error)
type raftUserCmdApplyHandler func(opt uint32, key string, cmdMap map[string][]byte) (err error)
type raftApplySnapshotHandler func()
// MetadataFsm represents the finite state machine of a metadata partition
type MetadataFsm struct {
store *raftstore.RocksDBStore
rs *raft.RaftServer
applied uint64
retainLogs uint64
leaderChangeHandler raftLeaderChangeHandler
peerChangeHandler raftPeerChangeHandler
snapshotHandler raftApplySnapshotHandler
UserAppCmdHandler raftUserCmdApplyHandler
onSnapshot bool
}
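// newMetadataFsm returns a MetadataFsm backed by the given RocksDB store and raft server.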
func newMetadataFsm(store *raftstore.RocksDBStore, retainsLog uint64, rs *raft.RaftServer) (fsm *MetadataFsm) {
fsm = new(MetadataFsm)
fsm.store = store
fsm.rs = rs
fsm.retainLogs = retainsLog
return
}
// Corresponding to the LeaderChange interface in Raft library.
func (mf *MetadataFsm) registerLeaderChangeHandler(handler raftLeaderChangeHandler) {
mf.leaderChangeHandler = handler
}
// Corresponding to the PeerChange interface in Raft library.
func (mf *MetadataFsm) registerPeerChangeHandler(handler raftPeerChangeHandler) {
mf.peerChangeHandler = handler
}
// Corresponding to the ApplySnapshot interface in Raft library.
func (mf *MetadataFsm) registerApplySnapshotHandler(handler raftApplySnapshotHandler) {
mf.snapshotHandler = handler
}
// Corresponding to the ApplyRaftCmd interface in Raft library.
func (mf *MetadataFsm) registerRaftUserCmdApplyHandler(handler raftUserCmdApplyHandler) {
mf.UserAppCmdHandler = handler
}
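// restore reloads the persisted applied index from the RocksDB store.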
func (mf *MetadataFsm) restore() {
mf.restoreApplied()
}
func (mf *MetadataFsm) restoreApplied() {
value, err := mf.store.Get(applied)
if err != nil {
panic(fmt.Sprintf("Failed to restore applied err:%v", err.Error()))
}
byteValues := value.([]byte)
if len(byteValues) == 0 {
mf.applied = 0
return
}
applied, err := strconv.ParseUint(string(byteValues), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore applied,err:%v ", err.Error()))
}
mf.applied = applied
}
// Apply implements the interface of raft.StateMachine
func (mf *MetadataFsm) Apply(command []byte, index uint64) (resp interface{}, err error) {
log.LogDebugf("[Apply] apply index(%v)", index)
cmd := new(RaftCmd)
if err = cmd.Unmarshal(command); err != nil {
log.LogErrorf("action[fsmApply],unmarshal data:%v, err:%v", command, err.Error())
panic(err)
}
cmdMap := make(map[string][]byte)
deleteSet := make(map[string]util.Null)
if cmd.Op != opSyncBatchPut {
cmdMap[cmd.K] = cmd.V
cmdMap[applied] = []byte(strconv.FormatUint(uint64(index), 10))
} else {
nestedCmdMap := make(map[string]*RaftCmd)
if err = json.Unmarshal(cmd.V, &nestedCmdMap); err != nil {
log.LogErrorf("action[fsmApply],unmarshal nested cmd data:%v, err:%v", command, err.Error())
panic(err)
}
for cmdK, cmd := range nestedCmdMap {
switch cmd.Op {
case opSyncDeleteDataNode, opSyncDeleteMetaNode, opSyncDeleteVol, opSyncDeleteDataPartition, opSyncDeleteMetaPartition,
opSyncDeleteUserInfo, opSyncDeleteAKUser, opSyncDeleteVolUser, opSyncDeleteQuota, opSyncDeleteLcNode, opSyncDeleteLcConf, opSyncS3QosDelete:
deleteSet[cmdK] = util.Null{}
// NOTE: opSyncPutFollowerApiLimiterInfo and opSyncPutApiLimiterInfo may need special handling
default:
cmdMap[cmdK] = cmd.V
}
}
cmdMap[applied] = []byte(strconv.FormatUint(uint64(index), 10))
}
switch cmd.Op {
case opSyncDeleteDataNode, opSyncDeleteMetaNode, opSyncDeleteVol, opSyncDeleteDataPartition, opSyncDeleteMetaPartition,
opSyncDeleteUserInfo, opSyncDeleteAKUser, opSyncDeleteVolUser, opSyncDeleteQuota, opSyncDeleteLcNode, opSyncDeleteLcConf, opSyncS3QosDelete:
if err = mf.delKeyAndPutIndex(cmd.K, cmdMap); err != nil {
panic(err)
}
case opSyncPutFollowerApiLimiterInfo, opSyncPutApiLimiterInfo:
mf.UserAppCmdHandler(cmd.Op, cmd.K, cmdMap)
//if err = mf.delKeyAndPutIndex(cmd.K, cmdMap); err != nil {
// panic(err)
//}
if err = mf.store.BatchPut(cmdMap, true); err != nil {
panic(err)
}
default:
// sync put data
if err = mf.store.BatchDeleteAndPut(deleteSet, cmdMap, true); err != nil {
panic(err)
}
}
mf.applied = index
if mf.applied > 0 && (mf.applied%mf.retainLogs) == 0 {
log.LogWarnf("action[Apply],truncate raft log,retainLogs[%v],index[%v]", mf.retainLogs, mf.applied)
mf.rs.Truncate(GroupID, mf.applied)
}
return
}
// ApplyMemberChange implements the interface of raft.StateMachine
func (mf *MetadataFsm) ApplyMemberChange(confChange *proto.ConfChange, index uint64) (interface{}, error) {
var err error
if mf.peerChangeHandler != nil {
err = mf.peerChangeHandler(confChange)
}
return nil, err
}
// Snapshot implements the interface of raft.StateMachine
func (mf *MetadataFsm) Snapshot() (proto.Snapshot, error) {
snapshot := mf.store.RocksDBSnapshot()
iterator := mf.store.Iterator(snapshot)
iterator.SeekToFirst()
return &MetadataSnapshot{
applied: mf.applied,
snapshot: snapshot,
fsm: mf,
iterator: iterator,
}, nil
}
// ApplySnapshot implements the interface of raft.StateMachine
func (mf *MetadataFsm) ApplySnapshot(peers []proto.Peer, iterator proto.SnapIterator) (err error) {
log.LogWarnf("action[ApplySnapshot] reset rocksdb before applying snapshot")
mf.onSnapshot = true
defer func() {
mf.onSnapshot = false
}()
if log.EnableDebug() {
func() {
snap := mf.store.RocksDBSnapshot()
defer mf.store.ReleaseSnapshot(snap)
iter := mf.store.Iterator(snap)
defer iter.Close()
cnt := 0
for iter.SeekToFirst(); iter.Valid(); iter.Next() {
cnt++
}
log.LogDebugf("[ApplySnapshot] scan %v keys before clear", cnt)
}()
}
mf.store.Clear()
if log.EnableDebug() {
func() {
snap := mf.store.RocksDBSnapshot()
defer mf.store.ReleaseSnapshot(snap)
iter := mf.store.Iterator(snap)
defer iter.Close()
cnt := 0
for iter.SeekToFirst(); iter.Valid(); iter.Next() {
cnt++
}
log.LogDebugf("[ApplySnapshot] scan %v keys after clear", cnt)
}()
}
log.LogWarnf(fmt.Sprintf("action[ApplySnapshot] begin,applied[%v]", mf.applied))
var data []byte
var appliedIndex []byte
for err == nil {
bgTime := stat.BeginStat()
if data, err = iterator.Next(); err != nil {
break
}
stat.EndStat("ApplySnapshot-Next", err, bgTime, 1)
cmd := &RaftCmd{}
if err = json.Unmarshal(data, cmd); err != nil {
goto errHandler
}
bgTime = stat.BeginStat()
if cmd.K != applied {
if _, err = mf.store.Put(cmd.K, cmd.V, false); err != nil {
goto errHandler
}
} else {
appliedIndex = cmd.V
}
stat.EndStat("ApplySnapshot-Put", err, bgTime, 1)
}
if err != nil && err != io.EOF {
goto errHandler
}
if err = mf.store.Flush(); err != nil {
log.LogError(fmt.Sprintf("action[ApplySnapshot] Flush failed,err:%v", err.Error()))
goto errHandler
}
// NOTE: the applied index is written last
log.LogDebugf("[ApplySnapshot] find applied index(%v)", appliedIndex)
if appliedIndex != nil {
if _, err = mf.store.Put(applied, appliedIndex, true); err != nil {
goto errHandler
}
} else {
log.LogErrorf("[ApplySnapshot] not found applied index in snapshot")
}
mf.snapshotHandler()
log.LogWarnf(fmt.Sprintf("action[ApplySnapshot] success,applied[%v]", mf.applied))
return nil
errHandler:
log.LogError(fmt.Sprintf("action[ApplySnapshot] failed,err:%v", err.Error()))
return err
}
// HandleFatalEvent implements the interface of raft.StateMachine
func (mf *MetadataFsm) HandleFatalEvent(err *raft.FatalError) {
panic(err.Err)
}
// HandleLeaderChange implements the interface of raft.StateMachine
func (mf *MetadataFsm) HandleLeaderChange(leader uint64) {
if mf.leaderChangeHandler != nil {
go mf.leaderChangeHandler(leader)
}
}
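// delKeyAndPutIndex deletes the given key and persists the remaining entries in cmdMap (including the applied index).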
func (mf *MetadataFsm) delKeyAndPutIndex(key string, cmdMap map[string][]byte) (err error) {
return mf.store.DeleteKeyAndPutIndex(key, cmdMap, true)
}
// Stop stops the RaftServer
func (mf *MetadataFsm) Stop() {
if mf.rs != nil {
mf.rs.Stop()
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"sync/atomic"
"time"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
bsProto "github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
/* We define several "values" here, such as clusterValue, metaPartitionValue, dataPartitionValue, volValue, dataNodeValue,
nodeSetValue, and metaNodeValue. They are the value objects that are marshaled into byte arrays and
transferred over the network. */
type clusterValue struct {
Name string
CreateTime int64
Threshold float32
LoadFactor float32
DisableAutoAllocate bool
ForbidMpDecommission bool
DataNodeDeleteLimitRate uint64
MetaNodeDeleteBatchCount uint64
MetaNodeDeleteWorkerSleepMs uint64
DataNodeAutoRepairLimitRate uint64
MaxDpCntLimit uint64
FaultDomain bool
DiskQosEnable bool
QosLimitUpload uint64
DirChildrenNumLimit uint32
DecommissionLimit uint64
CheckDataReplicasEnable bool
FileStatsEnable bool
ClusterUuid string
ClusterUuidEnable bool
MetaPartitionInodeIdStep uint64
MaxConcurrentLcNodes uint64
DpMaxRepairErrCnt uint64
DpRepairTimeOut uint64
EnableAutoDecommissionDisk bool
DecommissionDiskFactor float64
}
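// newClusterValue snapshots the cluster-level settings of c into a clusterValue for persistence.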
func newClusterValue(c *Cluster) (cv *clusterValue) {
cv = &clusterValue{
Name: c.Name,
CreateTime: c.CreateTime,
LoadFactor: c.cfg.ClusterLoadFactor,
Threshold: c.cfg.MetaNodeThreshold,
DataNodeDeleteLimitRate: c.cfg.DataNodeDeleteLimitRate,
MetaNodeDeleteBatchCount: c.cfg.MetaNodeDeleteBatchCount,
MetaNodeDeleteWorkerSleepMs: c.cfg.MetaNodeDeleteWorkerSleepMs,
DataNodeAutoRepairLimitRate: c.cfg.DataNodeAutoRepairLimitRate,
DisableAutoAllocate: c.DisableAutoAllocate,
ForbidMpDecommission: c.ForbidMpDecommission,
MaxDpCntLimit: c.cfg.MaxDpCntLimit,
FaultDomain: c.FaultDomain,
DiskQosEnable: c.diskQosEnable,
QosLimitUpload: uint64(c.QosAcceptLimit.Limit()),
DirChildrenNumLimit: c.cfg.DirChildrenNumLimit,
DecommissionLimit: c.DecommissionLimit,
CheckDataReplicasEnable: c.checkDataReplicasEnable,
FileStatsEnable: c.fileStatsEnable,
ClusterUuid: c.clusterUuid,
ClusterUuidEnable: c.clusterUuidEnable,
MetaPartitionInodeIdStep: c.cfg.MetaPartitionInodeIdStep,
MaxConcurrentLcNodes: c.cfg.MaxConcurrentLcNodes,
DpMaxRepairErrCnt: c.cfg.DpMaxRepairErrCnt,
DpRepairTimeOut: c.cfg.DpRepairTimeOut,
EnableAutoDecommissionDisk: c.EnableAutoDecommissionDisk,
DecommissionDiskFactor: c.DecommissionDiskFactor,
}
return cv
}
type metaPartitionValue struct {
PartitionID uint64
Start uint64
End uint64
VolID uint64
ReplicaNum uint8
Status int8
VolName string
Hosts string
OfflinePeerID uint64
Peers []bsProto.Peer
IsRecover bool
}
func newMetaPartitionValue(mp *MetaPartition) (mpv *metaPartitionValue) {
mpv = &metaPartitionValue{
PartitionID: mp.PartitionID,
Start: mp.Start,
End: mp.End,
VolID: mp.volID,
ReplicaNum: mp.ReplicaNum,
Status: mp.Status,
VolName: mp.volName,
Hosts: mp.hostsToString(),
Peers: mp.Peers,
OfflinePeerID: mp.OfflinePeerID,
IsRecover: mp.IsRecover,
}
return
}
type dataPartitionValue struct {
PartitionID uint64
ReplicaNum uint8
Hosts string
Peers []bsProto.Peer
Status int8
VolID uint64
VolName string
OfflinePeerID uint64
Replicas []*replicaValue
IsRecover bool
PartitionType int
PartitionTTL int64
RdOnly bool
IsDiscard bool
DecommissionRetry int
DecommissionStatus uint32
DecommissionSrcAddr string
DecommissionDstAddr string
DecommissionRaftForce bool
DecommissionSrcDiskPath string
DecommissionTerm uint64
SpecialReplicaDecommissionStep uint32
DecommissionDstAddrSpecify bool
DecommissionNeedRollback bool
RecoverStartTime int64
RecoverLastConsumeTime float64
Forbidden bool
DecommissionWaitTimes int
}
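// Restore rebuilds a DataPartition from its persisted value, fixing up peer IDs against the
// currently registered data nodes and re-attaching only the replicas whose hosts are still present.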
func (dpv *dataPartitionValue) Restore(c *Cluster) (dp *DataPartition) {
for i := 0; i < len(dpv.Peers); i++ {
dn, ok := c.dataNodes.Load(dpv.Peers[i].Addr)
if ok && dn.(*DataNode).ID != dpv.Peers[i].ID {
dpv.Peers[i].ID = dn.(*DataNode).ID
}
}
dp = newDataPartition(dpv.PartitionID, dpv.ReplicaNum, dpv.VolName, dpv.VolID, dpv.PartitionType, dpv.PartitionTTL)
dp.Hosts = strings.Split(dpv.Hosts, underlineSeparator)
dp.Peers = dpv.Peers
dp.OfflinePeerID = dpv.OfflinePeerID
dp.isRecover = dpv.IsRecover
dp.RdOnly = dpv.RdOnly
dp.IsDiscard = dpv.IsDiscard
dp.DecommissionRaftForce = dpv.DecommissionRaftForce
dp.DecommissionDstAddr = dpv.DecommissionDstAddr
dp.DecommissionSrcAddr = dpv.DecommissionSrcAddr
dp.DecommissionRetry = dpv.DecommissionRetry
dp.DecommissionStatus = dpv.DecommissionStatus
dp.DecommissionSrcDiskPath = dpv.DecommissionSrcDiskPath
dp.DecommissionTerm = dpv.DecommissionTerm
dp.SpecialReplicaDecommissionStep = dpv.SpecialReplicaDecommissionStep
dp.DecommissionDstAddrSpecify = dpv.DecommissionDstAddrSpecify
dp.DecommissionNeedRollback = dpv.DecommissionNeedRollback
dp.RecoverStartTime = time.Unix(dpv.RecoverStartTime, 0)
dp.RecoverLastConsumeTime = time.Duration(dpv.RecoverLastConsumeTime) * time.Second
dp.DecommissionWaitTimes = dpv.DecommissionWaitTimes
for _, rv := range dpv.Replicas {
if !contains(dp.Hosts, rv.Addr) {
continue
}
dp.afterCreation(rv.Addr, rv.DiskPath, c)
}
return dp
}
type replicaValue struct {
Addr string
DiskPath string
}
func newDataPartitionValue(dp *DataPartition) (dpv *dataPartitionValue) {
dpv = &dataPartitionValue{
PartitionID: dp.PartitionID,
ReplicaNum: dp.ReplicaNum,
Hosts: dp.hostsToString(),
Peers: dp.Peers,
Status: dp.Status,
VolID: dp.VolID,
VolName: dp.VolName,
OfflinePeerID: dp.OfflinePeerID,
Replicas: make([]*replicaValue, 0),
IsRecover: dp.isRecover,
PartitionType: dp.PartitionType,
PartitionTTL: dp.PartitionTTL,
RdOnly: dp.RdOnly,
IsDiscard: dp.IsDiscard,
DecommissionRetry: dp.DecommissionRetry,
DecommissionStatus: dp.DecommissionStatus,
DecommissionSrcAddr: dp.DecommissionSrcAddr,
DecommissionDstAddr: dp.DecommissionDstAddr,
DecommissionRaftForce: dp.DecommissionRaftForce,
DecommissionSrcDiskPath: dp.DecommissionSrcDiskPath,
DecommissionTerm: dp.DecommissionTerm,
SpecialReplicaDecommissionStep: dp.SpecialReplicaDecommissionStep,
DecommissionDstAddrSpecify: dp.DecommissionDstAddrSpecify,
DecommissionNeedRollback: dp.DecommissionNeedRollback,
RecoverStartTime: dp.RecoverStartTime.Unix(),
RecoverLastConsumeTime: dp.RecoverLastConsumeTime.Seconds(),
DecommissionWaitTimes: dp.DecommissionWaitTimes,
}
for _, replica := range dp.Replicas {
rv := &replicaValue{Addr: replica.Addr, DiskPath: replica.DiskPath}
dpv.Replicas = append(dpv.Replicas, rv)
}
return
}
type volValue struct {
ID uint64
Name string
ReplicaNum uint8
DpReplicaNum uint8
Status uint8
DataPartitionSize uint64
Capacity uint64
Owner string
FollowerRead bool
Authenticate bool
DpReadOnlyWhenVolFull bool
CrossZone bool
DomainOn bool
ZoneName string
OSSAccessKey string
OSSSecretKey string
CreateTime int64
DeleteLockTime int64
Description string
DpSelectorName string
DpSelectorParm string
DefaultPriority bool
DomainId uint64
VolType int
EbsBlkSize int
CacheCapacity uint64
CacheAction int
CacheThreshold int
CacheTTL int
CacheHighWater int
CacheLowWater int
CacheLRUInterval int
CacheRule string
EnablePosixAcl bool
EnableQuota bool
EnableTransaction bsProto.TxOpMask
TxTimeout int64
TxConflictRetryNum int64
TxConflictRetryInterval int64
TxOpLimit int
VolQosEnable bool
DiskQosEnable bool
IopsRLimit, IopsWLimit, FlowRlimit, FlowWlimit uint64
IopsRMagnify, IopsWMagnify, FlowRMagnify, FlowWMagnify uint32
ClientReqPeriod, ClientHitTriggerCnt uint32
Forbidden bool
EnableAuditLog bool
}
func (v *volValue) Bytes() (raw []byte, err error) {
raw, err = json.Marshal(v)
return
}
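// newVolValue snapshots the volume configuration of vol into a volValue for persistence.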
func newVolValue(vol *Vol) (vv *volValue) {
vv = &volValue{
ID: vol.ID,
Name: vol.Name,
ReplicaNum: vol.mpReplicaNum,
DpReplicaNum: vol.dpReplicaNum,
Status: vol.Status,
DataPartitionSize: vol.dataPartitionSize,
Capacity: vol.Capacity,
Owner: vol.Owner,
FollowerRead: vol.FollowerRead,
Authenticate: vol.authenticate,
CrossZone: vol.crossZone,
DomainOn: vol.domainOn,
ZoneName: vol.zoneName,
OSSAccessKey: vol.OSSAccessKey,
OSSSecretKey: vol.OSSSecretKey,
CreateTime: vol.createTime,
DeleteLockTime: vol.DeleteLockTime,
Description: vol.description,
DpSelectorName: vol.dpSelectorName,
DpSelectorParm: vol.dpSelectorParm,
DefaultPriority: vol.defaultPriority,
EnablePosixAcl: vol.enablePosixAcl,
EnableQuota: vol.enableQuota,
EnableTransaction: vol.enableTransaction,
TxTimeout: vol.txTimeout,
TxConflictRetryNum: vol.txConflictRetryNum,
TxConflictRetryInterval: vol.txConflictRetryInterval,
TxOpLimit: vol.txOpLimit,
VolType: vol.VolType,
EbsBlkSize: vol.EbsBlkSize,
CacheCapacity: vol.CacheCapacity,
CacheAction: vol.CacheAction,
CacheThreshold: vol.CacheThreshold,
CacheTTL: vol.CacheTTL,
CacheHighWater: vol.CacheHighWater,
CacheLowWater: vol.CacheLowWater,
CacheLRUInterval: vol.CacheLRUInterval,
CacheRule: vol.CacheRule,
VolQosEnable: vol.qosManager.qosEnable,
IopsRLimit: vol.qosManager.getQosLimit(bsProto.IopsReadType),
IopsWLimit: vol.qosManager.getQosLimit(bsProto.IopsWriteType),
FlowRlimit: vol.qosManager.getQosLimit(bsProto.FlowReadType),
FlowWlimit: vol.qosManager.getQosLimit(bsProto.FlowWriteType),
IopsRMagnify: vol.qosManager.getQosMagnify(bsProto.IopsReadType),
IopsWMagnify: vol.qosManager.getQosMagnify(bsProto.IopsWriteType),
FlowRMagnify: vol.qosManager.getQosMagnify(bsProto.FlowReadType),
FlowWMagnify: vol.qosManager.getQosMagnify(bsProto.FlowWriteType),
ClientReqPeriod: vol.qosManager.ClientReqPeriod,
ClientHitTriggerCnt: vol.qosManager.ClientHitTriggerCnt,
DpReadOnlyWhenVolFull: vol.DpReadOnlyWhenVolFull,
Forbidden: vol.Forbidden,
EnableAuditLog: vol.EnableAuditLog,
}
return
}
func newVolValueFromBytes(raw []byte) (*volValue, error) {
vv := &volValue{}
if err := json.Unmarshal(raw, vv); err != nil {
return nil, err
}
return vv, nil
}
type dataNodeValue struct {
ID uint64
NodeSetID uint64
Addr string
ZoneName string
RdOnly bool
DecommissionedDisks []string
DecommissionStatus uint32
DecommissionDstAddr string
DecommissionRaftForce bool
DecommissionLimit int
DecommissionRetry uint8
DecommissionCompleteTime int64
ToBeOffline bool
DecommissionDiskList []string
DecommissionDpTotal int
}
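// newDataNodeValue snapshots the persistent fields of a DataNode, including its decommission state.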
func newDataNodeValue(dataNode *DataNode) *dataNodeValue {
return &dataNodeValue{
ID: dataNode.ID,
NodeSetID: dataNode.NodeSetID,
Addr: dataNode.Addr,
ZoneName: dataNode.ZoneName,
RdOnly: dataNode.RdOnly,
DecommissionedDisks: dataNode.getDecommissionedDisks(),
DecommissionStatus: atomic.LoadUint32(&dataNode.DecommissionStatus),
DecommissionDstAddr: dataNode.DecommissionDstAddr,
DecommissionRaftForce: dataNode.DecommissionRaftForce,
DecommissionLimit: dataNode.DecommissionLimit,
DecommissionRetry: dataNode.DecommissionRetry,
DecommissionCompleteTime: dataNode.DecommissionCompleteTime,
ToBeOffline: dataNode.ToBeOffline,
DecommissionDiskList: dataNode.DecommissionDiskList,
DecommissionDpTotal: dataNode.DecommissionDpTotal,
}
}
type metaNodeValue struct {
ID uint64
NodeSetID uint64
Addr string
ZoneName string
RdOnly bool
}
func newMetaNodeValue(metaNode *MetaNode) *metaNodeValue {
return &metaNodeValue{
ID: metaNode.ID,
NodeSetID: metaNode.NodeSetID,
Addr: metaNode.Addr,
ZoneName: metaNode.ZoneName,
RdOnly: metaNode.RdOnly,
}
}
type nodeSetValue struct {
ID uint64
Capacity int
ZoneName string
DataNodeSelector string
MetaNodeSelector string
}
type domainNodeSetGrpValue struct {
DomainId uint64
ID uint64
NodeSetsIds []uint64
Status uint8
}
type zoneDomainValue struct {
ExcludeZoneMap map[string]int
NeedFaultDomain bool
DataRatio float64
domainNodeSetGrpVec []*DomainNodeSetGrpManager
DomainZoneName2IdMap map[string]uint64 // zoneName:domainId
ExcludeZoneUseRatio float64
}
func newZoneDomainValue() (ev *zoneDomainValue) {
ev = &zoneDomainValue{
ExcludeZoneMap: make(map[string]int),
}
return
}
func newNodeSetValue(nset *nodeSet) (nsv *nodeSetValue) {
nsv = &nodeSetValue{
ID: nset.ID,
Capacity: nset.Capacity,
ZoneName: nset.zoneName,
DataNodeSelector: nset.GetDataNodeSelector(),
MetaNodeSelector: nset.GetMetaNodeSelector(),
}
return
}
func newNodeSetGrpValue(nset *nodeSetGroup) (nsv *domainNodeSetGrpValue) {
nsv = &domainNodeSetGrpValue{
DomainId: nset.domainId,
ID: nset.ID,
NodeSetsIds: nset.nodeSetsIds,
Status: nset.status,
}
return
}
// RaftCmd defines the Raft commands.
type RaftCmd struct {
Op uint32 `json:"op"`
K string `json:"k"`
V []byte `json:"v"`
}
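// A raft command is normally built by one of the sync* helpers below and proposed through
// Cluster.submit. A minimal sketch (values are illustrative only, mirroring syncPutCluster):
//
//	cmd := new(RaftCmd)
//	cmd.Op = opSyncPutCluster
//	cmd.K = clusterPrefix + c.Name
//	cmd.V, _ = json.Marshal(newClusterValue(c))
//	err := c.submit(cmd)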
// Marshal converts the RaftCmd to a byte array.
func (m *RaftCmd) Marshal() ([]byte, error) {
return json.Marshal(m)
}
// Unmarshal converts the byte array to a RaftCmd.
func (m *RaftCmd) Unmarshal(data []byte) (err error) {
return json.Unmarshal(data, m)
}
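// setOpType infers the op code of a RaftCmd from the acronym embedded in its key.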
func (m *RaftCmd) setOpType() {
keyArr := strings.Split(m.K, keySeparator)
if len(keyArr) < 2 {
log.LogWarnf("action[setOpType] invalid length[%v]", keyArr)
return
}
switch keyArr[1] {
case metaNodeAcronym:
m.Op = opSyncAddMetaNode
case dataNodeAcronym:
m.Op = opSyncAddDataNode
case dataPartitionAcronym:
m.Op = opSyncAddDataPartition
case metaPartitionAcronym:
m.Op = opSyncAddMetaPartition
case volAcronym:
m.Op = opSyncAddVol
case clusterAcronym:
m.Op = opSyncPutCluster
case nodeSetAcronym:
m.Op = opSyncAddNodeSet
case maxDataPartitionIDKey:
m.Op = opSyncAllocDataPartitionID
case maxMetaPartitionIDKey:
m.Op = opSyncAllocMetaPartitionID
case maxCommonIDKey:
m.Op = opSyncAllocCommonID
case userAcronym:
m.Op = opSyncAddUserInfo
case akAcronym:
m.Op = opSyncAddAKUser
case volUserAcronym:
m.Op = opSyncAddVolUser
case lcNodeAcronym:
m.Op = opSyncAddLcNode
case lcConfigurationAcronym:
m.Op = opSyncAddLcConf
default:
log.LogWarnf("action[setOpType] unknown opCode[%v]", keyArr[1])
}
}
// key=#c#name
func (c *Cluster) syncPutCluster() (err error) {
metadata := new(RaftCmd)
metadata.Op = opSyncPutCluster
metadata.K = clusterPrefix + c.Name
cv := newClusterValue(c)
log.LogInfof("action[syncPutCluster] cluster value:[%+v]", cv)
metadata.V, err = json.Marshal(cv)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) syncPutApiLimiterInfo(followerLimiter bool) (err error) {
metadata := new(RaftCmd)
if followerLimiter {
metadata.Op = opSyncPutFollowerApiLimiterInfo
} else {
metadata.Op = opSyncPutApiLimiterInfo
}
metadata.K = apiLimiterPrefix + c.Name
c.apiLimiter.m.RLock()
metadata.V, err = json.Marshal(c.apiLimiter.limiterInfos)
c.apiLimiter.m.RUnlock()
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) loadApiLimiterInfo() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(apiLimiterPrefix))
if err != nil {
err = fmt.Errorf("action[loadApiLimiterInfo],err:%v", err.Error())
return err
}
for _, value := range result {
// cv := &clusterValue{}
limiterInfos := make(map[string]*ApiLimitInfo)
if err = json.Unmarshal(value, &limiterInfos); err != nil {
log.LogErrorf("action[loadApiLimiterInfo], unmarshal err:%v", err.Error())
return err
}
for _, v := range limiterInfos {
v.InitLimiter()
}
c.apiLimiter.m.Lock()
c.apiLimiter.limiterInfos = limiterInfos
c.apiLimiter.m.Unlock()
// c.apiLimiter.Replace(limiterInfos)
log.LogInfof("action[loadApiLimiterInfo], limiter info[%v]", value)
}
return
}
// key=#s#id
func (c *Cluster) syncAddNodeSet(nset *nodeSet) (err error) {
return c.putNodeSetInfo(opSyncAddNodeSet, nset)
}
func (c *Cluster) syncUpdateNodeSet(nset *nodeSet) (err error) {
return c.putNodeSetInfo(opSyncUpdateNodeSet, nset)
}
func (c *Cluster) putNodeSetInfo(opType uint32, nset *nodeSet) (err error) {
log.LogInfof("action[putNodeSetInfo], type:[%v], gridId:[%v], name:[%v]", opType, nset.ID, nset.zoneName)
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = nodeSetPrefix + strconv.FormatUint(nset.ID, 10)
nsv := newNodeSetValue(nset)
metadata.V, err = json.Marshal(nsv)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) putNodeSetGrpInfo(opType uint32, nsg *nodeSetGroup) (err error) {
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = nodeSetGrpPrefix + strconv.FormatUint(nsg.ID, 10)
log.LogInfof("action[putNodeSetGrpInfo] nsg id[%v] status[%v] ids[%v]", nsg.ID, nsg.status, nsg.nodeSetsIds)
nsv := newNodeSetGrpValue(nsg)
log.LogInfof("action[putNodeSetGrpInfo] nsv id[%v] status[%v] ids[%v]", nsv.ID, nsv.Status, nsv.NodeSetsIds)
metadata.V, err = json.Marshal(nsv)
if err != nil {
return
}
return c.submit(metadata)
}
// key=#dp#volID#partitionID,value=json.Marshal(dataPartitionValue)
func (c *Cluster) syncAddDataPartition(dp *DataPartition) (err error) {
return c.putDataPartitionInfo(opSyncAddDataPartition, dp)
}
func (c *Cluster) syncUpdateDataPartition(dp *DataPartition) (err error) {
return c.putDataPartitionInfo(opSyncUpdateDataPartition, dp)
}
func (c *Cluster) syncDeleteDataPartition(dp *DataPartition) (err error) {
return c.putDataPartitionInfo(opSyncDeleteDataPartition, dp)
}
func (c *Cluster) buildDataPartitionRaftCmd(opType uint32, dp *DataPartition) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
metadata.K = dataPartitionPrefix + strconv.FormatUint(dp.VolID, 10) + keySeparator + strconv.FormatUint(dp.PartitionID, 10)
dpv := newDataPartitionValue(dp)
metadata.V, err = json.Marshal(dpv)
if err != nil {
return
}
return
}
func (c *Cluster) putDataPartitionInfo(opType uint32, dp *DataPartition) (err error) {
metadata, err := c.buildDataPartitionRaftCmd(opType, dp)
if err != nil {
return
}
return c.submit(metadata)
}
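// submit marshals the raft command and proposes it to the cluster's raft partition.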
func (c *Cluster) submit(metadata *RaftCmd) (err error) {
cmd, err := metadata.Marshal()
if err != nil {
return errors.New(err.Error())
}
if _, err = c.partition.Submit(cmd); err != nil {
msg := fmt.Sprintf("action[metadata_submit] err:%v", err.Error())
return errors.New(msg)
}
return
}
// key=#vol#volID,value=json.Marshal(vv)
func (c *Cluster) syncAddVol(vol *Vol) (err error) {
return c.syncPutVolInfo(opSyncAddVol, vol)
}
func (c *Cluster) syncUpdateVol(vol *Vol) (err error) {
return c.syncPutVolInfo(opSyncUpdateVol, vol)
}
func (c *Cluster) syncDeleteVol(vol *Vol) (err error) {
return c.syncPutVolInfo(opSyncDeleteVol, vol)
}
func (c *Cluster) sycnPutZoneInfo(zone *Zone) error {
var err error
metadata := new(RaftCmd)
metadata.Op = opSyncUpdateZone
metadata.K = zonePrefix + zone.name
vv := zone.getFsmValue()
if vv.Name == "" {
vv.Name = DefaultZoneName
}
log.LogInfof("action[sycnPutZoneInfo] zone name %v", vv.Name)
if metadata.V, err = json.Marshal(vv); err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
func (c *Cluster) buildVolInfoRaftCmd(opType uint32, vol *Vol) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
metadata.K = volPrefix + strconv.FormatUint(vol.ID, 10)
vv := newVolValue(vol)
if metadata.V, err = json.Marshal(vv); err != nil {
return nil, errors.New(err.Error())
}
return
}
func (c *Cluster) syncPutVolInfo(opType uint32, vol *Vol) (err error) {
metadata, err := c.buildVolInfoRaftCmd(opType, vol)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) syncAclList(vol *Vol, val []byte) (err error) {
log.LogDebugf("syncAclList vol %v vallen %v", vol.Name, len(val))
metadata := new(RaftCmd)
metadata.Op = opSyncAcl
metadata.K = AclPrefix + strconv.FormatUint(vol.ID, 10)
metadata.V = val
return c.submit(metadata)
}
func (c *Cluster) syncMultiVersion(vol *Vol, val []byte) (err error) {
metadata := new(RaftCmd)
metadata.Op = opSyncMulitVersion
metadata.K = MultiVerPrefix + strconv.FormatUint(vol.ID, 10)
metadata.V = val
if c == nil {
log.LogErrorf("syncMultiVersion c is nil")
return fmt.Errorf("vol %v but cluster is nil", vol.Name)
}
return c.submit(metadata)
}
func (c *Cluster) loadAclList(vol *Vol) (err error) {
key := AclPrefix + strconv.FormatUint(vol.ID, 10)
result, err := c.fsm.store.SeekForPrefix([]byte(key))
if err != nil {
log.LogErrorf("action[loadAclList] err %v", err)
return
}
log.LogDebugf("loadAclList vol %v rocksdb value count %v", vol.Name, len(result))
vol.aclMgr.init(c, vol)
for _, value := range result {
return vol.aclMgr.load(c, value)
}
return
}
func (c *Cluster) syncUidSpaceList(vol *Vol, val []byte) (err error) {
log.LogDebugf("syncUidSpaceList vol %v vallen %v", vol.Name, len(val))
metadata := new(RaftCmd)
metadata.Op = opSyncUid
metadata.K = UidPrefix + strconv.FormatUint(vol.ID, 10)
metadata.V = val
return c.submit(metadata)
}
func (c *Cluster) loadUidSpaceList(vol *Vol) (err error) {
key := UidPrefix + strconv.FormatUint(vol.ID, 10)
result, err := c.fsm.store.SeekForPrefix([]byte(key))
if err != nil {
log.LogErrorf("action[loadUidSpaceList] err %v", err)
return
}
log.LogDebugf("loadUidSpaceList vol %v rocksdb value count %v", vol.Name, len(result))
vol.initUidSpaceManager(c)
for _, value := range result {
return vol.uidSpaceManager.load(c, value)
}
return
}
func (c *Cluster) loadMultiVersion(vol *Vol) (err error) {
key := MultiVerPrefix + strconv.FormatUint(vol.ID, 10)
result, err := c.fsm.store.SeekForPrefix([]byte(key))
if err != nil {
log.LogErrorf("action[loadMultiVersion] err %v", err)
return
}
if len(result) == 0 {
log.LogWarnf("action[loadMultiVersion] MultiVersion zero and do init")
return vol.VersionMgr.init(c)
}
vol.VersionMgr.c = c
log.LogWarnf("action[loadMultiVersion] vol %v loadMultiVersion set cluster %v vol.VersionMgr %v", vol.Name, c, vol.VersionMgr)
for _, value := range result {
if err = vol.VersionMgr.loadMultiVersion(c, value); err != nil {
log.LogErrorf("action[loadMultiVersion] vol %v err %v", vol.Name, err)
return
}
log.LogWarnf("action[loadMultiVersion] vol %v MultiVersion zero and do init, verlist %v", vol.Name, vol.VersionMgr)
}
return
}
// key=#mp#volID#metaPartitionID,value=json.Marshal(metaPartitionValue)
func (c *Cluster) syncAddMetaPartition(mp *MetaPartition) (err error) {
return c.putMetaPartitionInfo(opSyncAddMetaPartition, mp)
}
func (c *Cluster) syncUpdateMetaPartition(mp *MetaPartition) (err error) {
return c.putMetaPartitionInfo(opSyncUpdateMetaPartition, mp)
}
func (c *Cluster) syncDeleteMetaPartition(mp *MetaPartition) (err error) {
return c.putMetaPartitionInfo(opSyncDeleteMetaPartition, mp)
}
func (c *Cluster) putMetaPartitionInfo(opType uint32, mp *MetaPartition) (err error) {
metadata, err := c.buildMetaPartitionRaftCmd(opType, mp)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) buildMetaPartitionRaftCmd(opType uint32, mp *MetaPartition) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
partitionID := strconv.FormatUint(mp.PartitionID, 10)
metadata.K = metaPartitionPrefix + strconv.FormatUint(mp.volID, 10) + keySeparator + partitionID
mpv := newMetaPartitionValue(mp)
if metadata.V, err = json.Marshal(mpv); err != nil {
return metadata, errors.New(err.Error())
}
return
}
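// syncBatchCommitCmd packs multiple raft commands into a single opSyncBatchPut command and submits them in one raft proposal.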
func (c *Cluster) syncBatchCommitCmd(cmdMap map[string]*RaftCmd) (err error) {
value, err := json.Marshal(cmdMap)
if err != nil {
return
}
cmd := &RaftCmd{
Op: opSyncBatchPut,
K: "batch_put",
V: value,
}
return c.submit(cmd)
}
// key=#mn#id#addr,value=json.Marshal(mnv)
func (c *Cluster) syncAddMetaNode(metaNode *MetaNode) (err error) {
return c.syncPutMetaNode(opSyncAddMetaNode, metaNode)
}
func (c *Cluster) syncDeleteMetaNode(metaNode *MetaNode) (err error) {
return c.syncPutMetaNode(opSyncDeleteMetaNode, metaNode)
}
func (c *Cluster) syncUpdateMetaNode(metaNode *MetaNode) (err error) {
return c.syncPutMetaNode(opSyncUpdateMetaNode, metaNode)
}
func (c *Cluster) buildPutMetaNodeCmd(opType uint32, metaNode *MetaNode) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
metadata.K = metaNodePrefix + strconv.FormatUint(metaNode.ID, 10) + keySeparator + metaNode.Addr
mnv := newMetaNodeValue(metaNode)
metadata.V, err = json.Marshal(mnv)
return
}
func (c *Cluster) buildAddMetaNodeCmd(metaNode *MetaNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutMetaNodeCmd(opSyncAddMetaNode, metaNode)
return
}
func (c *Cluster) buildDeleteMetaNodeCmd(metaNode *MetaNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutMetaNodeCmd(opSyncDeleteMetaNode, metaNode)
return
}
func (c *Cluster) buildUpdateMetaNodeCmd(metaNode *MetaNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutMetaNodeCmd(opSyncUpdateMetaNode, metaNode)
return
}
func (c *Cluster) syncPutMetaNode(opType uint32, metaNode *MetaNode) (err error) {
metadata, err := c.buildPutMetaNodeCmd(opType, metaNode)
if err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
// key=#dn#id#Addr,value = json.Marshal(dnv)
func (c *Cluster) syncAddDataNode(dataNode *DataNode) (err error) {
return c.syncPutDataNode(opSyncAddDataNode, dataNode)
}
func (c *Cluster) syncDeleteDataNode(dataNode *DataNode) (err error) {
return c.syncPutDataNode(opSyncDeleteDataNode, dataNode)
}
func (c *Cluster) syncUpdateDataNode(dataNode *DataNode) (err error) {
return c.syncPutDataNode(opSyncUpdateDataNode, dataNode)
}
func (c *Cluster) buildAddDataNodeCmd(dataNode *DataNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutDataNodeCmd(opSyncAddDataNode, dataNode)
return
}
func (c *Cluster) buildDeleteDataNodeCmd(dataNode *DataNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutDataNodeCmd(opSyncDeleteDataNode, dataNode)
return
}
func (c *Cluster) buildUpdateDataNodeCmd(dataNode *DataNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutDataNodeCmd(opSyncUpdateDataNode, dataNode)
return
}
func (c *Cluster) buildPutDataNodeCmd(opType uint32, dataNode *DataNode) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
metadata.K = dataNodePrefix + strconv.FormatUint(dataNode.ID, 10) + keySeparator + dataNode.Addr
dnv := newDataNodeValue(dataNode)
metadata.V, err = json.Marshal(dnv)
if err != nil {
return
}
return
}
func (c *Cluster) syncPutDataNode(opType uint32, dataNode *DataNode) (err error) {
metadata, err := c.buildPutDataNodeCmd(opType, dataNode)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) addRaftNode(nodeID uint64, addr string) (err error) {
log.LogInfof("action[addRaftNode] nodeID: %v, addr: %v:", nodeID, addr)
peer := proto.Peer{ID: nodeID}
_, err = c.partition.ChangeMember(proto.ConfAddNode, peer, []byte(addr))
if err != nil {
return errors.New("action[addRaftNode] error: " + err.Error())
}
return nil
}
func (c *Cluster) removeRaftNode(nodeID uint64, addr string) (err error) {
log.LogInfof("action[removeRaftNode] nodeID: %v, addr: %v:", nodeID, addr)
peer := proto.Peer{ID: nodeID}
_, err = c.partition.ChangeMember(proto.ConfRemoveNode, peer, []byte(addr))
if err != nil {
return errors.New("action[removeRaftNode] error: " + err.Error())
}
return nil
}
func (c *Cluster) updateDirChildrenNumLimit(val uint32) {
if val < bsProto.MinDirChildrenNumLimit {
val = bsProto.DefaultDirChildrenNumLimit
}
atomic.StoreUint32(&c.cfg.DirChildrenNumLimit, val)
}
func (c *Cluster) updateMetaNodeDeleteBatchCount(val uint64) {
atomic.StoreUint64(&c.cfg.MetaNodeDeleteBatchCount, val)
}
func (c *Cluster) updateMetaNodeDeleteWorkerSleepMs(val uint64) {
atomic.StoreUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs, val)
}
func (c *Cluster) updateDataPartitionMaxRepairErrCnt(val uint64) {
atomic.StoreUint64(&c.cfg.DpMaxRepairErrCnt, val)
}
func (c *Cluster) updateDataPartitionRepairTimeOut(val uint64) {
atomic.StoreUint64(&c.cfg.DpRepairTimeOut, val)
}
func (c *Cluster) updateDataNodeAutoRepairLimit(val uint64) {
atomic.StoreUint64(&c.cfg.DataNodeAutoRepairLimitRate, val)
}
func (c *Cluster) updateDataNodeDeleteLimitRate(val uint64) {
atomic.StoreUint64(&c.cfg.DataNodeDeleteLimitRate, val)
}
func (c *Cluster) updateMaxDpCntLimit(val uint64) {
atomic.StoreUint64(&c.cfg.MaxDpCntLimit, val)
}
func (c *Cluster) updateInodeIdStep(val uint64) {
atomic.StoreUint64(&c.cfg.MetaPartitionInodeIdStep, val)
}
func (c *Cluster) loadZoneValue() (err error) {
var ok bool
result, err := c.fsm.store.SeekForPrefix([]byte(zonePrefix))
if err != nil {
err = fmt.Errorf("action[loadZoneValue],err:%v", err.Error())
return err
}
for _, value := range result {
cv := &zoneValue{}
if err = json.Unmarshal(value, cv); err != nil {
log.LogErrorf("action[loadZoneValue], unmarshal err:%v", err.Error())
continue
}
var zoneInfo interface{}
if zoneInfo, ok = c.t.zoneMap.Load(cv.Name); !ok {
log.LogErrorf("action[loadZoneValue], zonename [%v] not found", cv.Name)
continue
}
zone := zoneInfo.(*Zone)
zone.QosFlowRLimit = cv.QosFlowRLimit
zone.QosIopsWLimit = cv.QosIopsWLimit
zone.QosFlowWLimit = cv.QosFlowWLimit
zone.QosIopsRLimit = cv.QosIopsRLimit
if zone.GetDataNodesetSelector() != cv.DataNodesetSelector {
zone.dataNodesetSelector = NewNodesetSelector(cv.DataNodesetSelector, DataNodeType)
}
if zone.GetMetaNodesetSelector() != cv.MetaNodesetSelector {
zone.metaNodesetSelector = NewNodesetSelector(cv.MetaNodesetSelector, MetaNodeType)
}
log.LogInfof("action[loadZoneValue] load zonename[%v] with limit [%v,%v,%v,%v]",
zone.name, cv.QosFlowRLimit, cv.QosIopsWLimit, cv.QosFlowWLimit, cv.QosIopsRLimit)
zone.loadDataNodeQosLimit()
}
return
}
func (c *Cluster) updateMaxConcurrentLcNodes(val uint64) {
atomic.StoreUint64(&c.cfg.MaxConcurrentLcNodes, val)
}
// persist cluster value if not persisted; set create time for cluster being created.
func (c *Cluster) checkPersistClusterValue() {
result, err := c.fsm.store.SeekForPrefix([]byte(clusterPrefix))
if err != nil {
err = fmt.Errorf("action[checkPersistClusterValue] seek cluster value err: %v", err.Error())
panic(err)
}
if len(result) != 0 {
log.LogInfo("action[checkPersistClusterValue] already has cluster value record, need to do nothing")
return
}
/* When the cluster value has not been persisted, there are two possibilities:
- the cluster was created by an old-version master that did not persist the cluster value; no need to set the create time;
- the cluster is being created; the create time needs to be set.
Check whether node set info has been persisted to determine which scenario applies. */
result, err = c.fsm.store.SeekForPrefix([]byte(nodeSetPrefix))
if err != nil {
err = fmt.Errorf("action[checkPersistClusterValue] seek node set err: %v", err.Error())
panic(err)
}
oldVal := c.CreateTime
var scenarioMsg string
if len(result) != 0 {
scenarioMsg = "cluster already created"
} else {
scenarioMsg = "cluster being created"
c.CreateTime = time.Now().Unix()
}
log.LogInfo("action[checkPersistClusterValue] to add cluster value record for " + scenarioMsg)
if err = c.syncPutCluster(); err != nil {
c.CreateTime = oldVal
log.LogErrorf("action[checkPersistClusterValue] put err[%v]", err.Error())
panic(err)
}
log.LogInfo("action[checkPersistClusterValue] add cluster value record")
return
}
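// loadClusterValue restores the persisted cluster-level settings from RocksDB into c and its config.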
func (c *Cluster) loadClusterValue() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(clusterPrefix))
if err != nil {
err = fmt.Errorf("action[loadClusterValue],err:%v", err.Error())
return err
}
for _, value := range result {
cv := &clusterValue{}
if err = json.Unmarshal(value, cv); err != nil {
log.LogErrorf("action[loadClusterValue], unmarshal err:%v", err.Error())
return err
}
if cv.Name != c.Name {
log.LogErrorf("action[loadClusterValue] loaded cluster value: %+v", cv)
continue
}
log.LogDebugf("action[loadClusterValue] loaded cluster value: %+v", cv)
c.CreateTime = cv.CreateTime
if cv.MaxConcurrentLcNodes == 0 {
cv.MaxConcurrentLcNodes = defaultMaxConcurrentLcNodes
}
c.cfg.MetaNodeThreshold = cv.Threshold
// c.cfg.DirChildrenNumLimit = cv.DirChildrenNumLimit
c.cfg.ClusterLoadFactor = cv.LoadFactor
c.DisableAutoAllocate = cv.DisableAutoAllocate
c.ForbidMpDecommission = cv.ForbidMpDecommission
c.diskQosEnable = cv.DiskQosEnable
c.cfg.QosMasterAcceptLimit = cv.QosLimitUpload
c.DecommissionLimit = cv.DecommissionLimit // don't update the nodeset limits here; node sets are not loaded yet
c.fileStatsEnable = cv.FileStatsEnable
c.clusterUuid = cv.ClusterUuid
c.clusterUuidEnable = cv.ClusterUuidEnable
c.DecommissionLimit = cv.DecommissionLimit
c.EnableAutoDecommissionDisk = cv.EnableAutoDecommissionDisk
c.DecommissionDiskFactor = cv.DecommissionDiskFactor
if c.cfg.QosMasterAcceptLimit < QosMasterAcceptCnt {
c.cfg.QosMasterAcceptLimit = QosMasterAcceptCnt
}
c.QosAcceptLimit.SetLimit(rate.Limit(c.cfg.QosMasterAcceptLimit))
log.LogInfof("action[loadClusterValue] qos limit %v", c.cfg.QosMasterAcceptLimit)
c.updateDirChildrenNumLimit(cv.DirChildrenNumLimit)
c.updateMetaNodeDeleteBatchCount(cv.MetaNodeDeleteBatchCount)
c.updateMetaNodeDeleteWorkerSleepMs(cv.MetaNodeDeleteWorkerSleepMs)
c.updateDataNodeDeleteLimitRate(cv.DataNodeDeleteLimitRate)
c.updateDataNodeAutoRepairLimit(cv.DataNodeAutoRepairLimitRate)
c.updateDataPartitionMaxRepairErrCnt(cv.DpMaxRepairErrCnt)
c.updateDataPartitionRepairTimeOut(cv.DpRepairTimeOut)
c.updateMaxDpCntLimit(cv.MaxDpCntLimit)
if cv.MetaPartitionInodeIdStep == 0 {
cv.MetaPartitionInodeIdStep = defaultMetaPartitionInodeIDStep
}
c.updateInodeIdStep(cv.MetaPartitionInodeIdStep)
c.updateMaxConcurrentLcNodes(cv.MaxConcurrentLcNodes)
log.LogInfof("action[loadClusterValue], metaNodeThreshold[%v]", cv.Threshold)
c.checkDataReplicasEnable = cv.CheckDataReplicasEnable
}
return
}
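// loadNodeSets restores the persisted node sets, re-creating their zones if necessary.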
func (c *Cluster) loadNodeSets() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(nodeSetPrefix))
if err != nil {
err = fmt.Errorf("action[loadNodeSets],err:%v", err.Error())
return err
}
for _, value := range result {
nsv := &nodeSetValue{}
if err = json.Unmarshal(value, nsv); err != nil {
log.LogErrorf("action[loadNodeSets], unmarshal err:%v", err.Error())
return err
}
if nsv.ZoneName == "" {
nsv.ZoneName = DefaultZoneName
}
cap := nsv.Capacity
if cap < 3 {
cap = c.cfg.nodeSetCapacity
}
ns := newNodeSet(c, nsv.ID, cap, nsv.ZoneName)
ns.UpdateMaxParallel(int32(c.DecommissionLimit))
ns.UpdateDecommissionDiskFactor(c.DecommissionDiskFactor)
if nsv.DataNodeSelector != "" && ns.GetDataNodeSelector() != nsv.DataNodeSelector {
ns.SetDataNodeSelector(nsv.DataNodeSelector)
}
if nsv.MetaNodeSelector != "" && ns.GetMetaNodeSelector() != nsv.MetaNodeSelector {
ns.SetMetaNodeSelector(nsv.MetaNodeSelector)
}
zone, err := c.t.getZone(nsv.ZoneName)
if err != nil {
log.LogErrorf("action[loadNodeSets], getZone err:%v", err)
zone = newZone(nsv.ZoneName)
c.t.putZoneIfAbsent(zone)
}
zone.putNodeSet(ns)
log.LogInfof("action[addNodeSetGrp] nodeSet[%v]", ns.ID)
if err = c.addNodeSetGrp(ns, true); err != nil {
log.LogErrorf("action[createNodeSet] nodeSet[%v] err[%v]", ns.ID, err)
return err
}
log.LogInfof("action[loadNodeSets], nsId[%v],zone[%v]", ns.ID, zone.name)
}
return nil
}
// putZoneDomain persists the excluded zones; it is only used once, when the master is upgraded and restarted.
func (c *Cluster) putZoneDomain(init bool) (err error) {
log.LogInfof("action[putZoneDomain]")
metadata := new(RaftCmd)
metadata.Op = opSyncExclueDomain
metadata.K = DomainPrefix
c.domainManager.RLock()
defer c.domainManager.RUnlock()
if init {
for i := 0; i < len(c.t.zones); i++ {
c.domainManager.excludeZoneListDomain[c.t.zones[i].name] = 0
c.t.domainExcludeZones = append(c.t.domainExcludeZones, c.t.zones[i].name)
}
if len(c.t.zones) == 0 {
c.needFaultDomain = true
}
}
domainValue := newZoneDomainValue()
domainValue.ExcludeZoneMap = c.domainManager.excludeZoneListDomain
domainValue.NeedFaultDomain = c.needFaultDomain
domainValue.domainNodeSetGrpVec = c.domainManager.domainNodeSetGrpVec
domainValue.DomainZoneName2IdMap = c.domainManager.ZoneName2DomainIdMap
if c.domainManager.dataRatioLimit > 0 {
log.LogInfof("action[putZoneDomain] ratio %v", c.domainManager.dataRatioLimit)
domainValue.DataRatio = c.domainManager.dataRatioLimit
} else {
domainValue.DataRatio = defaultDomainUsageThreshold
}
if c.domainManager.excludeZoneUseRatio > 0 && c.domainManager.excludeZoneUseRatio <= 1 {
domainValue.ExcludeZoneUseRatio = c.domainManager.excludeZoneUseRatio
} else {
domainValue.ExcludeZoneUseRatio = defaultDomainUsageThreshold
}
metadata.V, err = json.Marshal(domainValue)
if err != nil {
return
}
return c.submit(metadata)
}
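// loadZoneDomain restores the fault-domain settings; it returns ok=false when no domain record has been persisted yet.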
func (c *Cluster) loadZoneDomain() (ok bool, err error) {
log.LogInfof("action[loadZoneDomain]")
result, err := c.fsm.store.SeekForPrefix([]byte(DomainPrefix))
if err != nil {
err = fmt.Errorf("action[loadZoneDomain],err:%v", err.Error())
log.LogInfof("action[loadZoneDomain] err[%v]", err)
return false, err
}
if len(result) == 0 {
err = fmt.Errorf("action[loadZoneDomain],err:not found")
log.LogInfof("action[loadZoneDomain] err[%v]", err)
return false, nil
}
for _, value := range result {
nsv := &zoneDomainValue{}
if err = json.Unmarshal(value, nsv); err != nil {
log.LogErrorf("action[loadNodeSets], unmarshal err:%v", err.Error())
return true, err
}
log.LogInfof("action[loadZoneDomain] get value!exclue map[%v],need domain[%v] ratio [%v]", nsv.ExcludeZoneMap, nsv.NeedFaultDomain, nsv.DataRatio)
c.domainManager.excludeZoneListDomain = nsv.ExcludeZoneMap
for zoneName := range nsv.ExcludeZoneMap {
c.t.domainExcludeZones = append(c.t.domainExcludeZones, zoneName)
}
c.needFaultDomain = nsv.NeedFaultDomain
c.domainManager.dataRatioLimit = nsv.DataRatio
c.domainManager.ZoneName2DomainIdMap = nsv.DomainZoneName2IdMap
c.domainManager.excludeZoneUseRatio = nsv.ExcludeZoneUseRatio
for zoneName, domainId := range c.domainManager.ZoneName2DomainIdMap {
log.LogInfof("action[loadZoneDomain] zoneName %v domainid %v", zoneName, domainId)
if domainIndex, ok := c.domainManager.domainId2IndexMap[domainId]; !ok {
log.LogInfof("action[loadZoneDomain] zoneName %v domainid %v build new domainnodesetgrp manager", zoneName, domainId)
domainGrp := newDomainNodeSetGrpManager()
domainGrp.domainId = domainId
c.domainManager.domainNodeSetGrpVec = append(c.domainManager.domainNodeSetGrpVec, domainGrp)
domainIndex = len(c.domainManager.domainNodeSetGrpVec) - 1
c.domainManager.domainId2IndexMap[domainId] = domainIndex
}
}
break
}
log.LogInfof("action[loadZoneDomain] success!")
return true, nil
}
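// loadNodeSetGrps restores the persisted domain node-set groups and rebuilds the domain manager's indexes.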
func (c *Cluster) loadNodeSetGrps() (err error) {
log.LogInfof("action[loadNodeSetGrps]")
result, err := c.fsm.store.SeekForPrefix([]byte(nodeSetGrpPrefix))
if err != nil {
err = fmt.Errorf("action[loadNodeSets],err:%v", err.Error())
log.LogInfof("action[loadNodeSetGrps] seek failed, nsgId[%v]", err)
return err
}
if len(result) > 0 {
log.LogInfof("action[loadNodeSetGrps] get result len[%v]", len(result))
c.domainManager.start()
}
log.LogInfof("action[loadNodeSetGrps] get result len[%v] before decode", len(result))
for _, value := range result {
domainInfoLoad := &domainNodeSetGrpValue{}
if err = json.Unmarshal(value, domainInfoLoad); err != nil {
log.LogFatalf("action[loadNodeSets], unmarshal err:%v", err.Error())
return err
}
log.LogInfof("action[loadNodeSetGrps] get result domainid [%v] domainInfoLoad id[%v],status[%v],ids[%v]",
domainInfoLoad.DomainId, domainInfoLoad.ID, domainInfoLoad.Status, domainInfoLoad.NodeSetsIds)
nsg := newNodeSetGrp(c)
nsg.nodeSetsIds = domainInfoLoad.NodeSetsIds
nsg.ID = domainInfoLoad.ID
nsg.status = domainInfoLoad.Status
nsg.domainId = domainInfoLoad.DomainId
domainId := domainInfoLoad.DomainId
var domainIndex int
var ok bool
var domainGrp *DomainNodeSetGrpManager
if domainIndex, ok = c.domainManager.domainId2IndexMap[domainId]; !ok {
domainGrp = newDomainNodeSetGrpManager()
domainGrp.domainId = domainId
c.domainManager.domainNodeSetGrpVec = append(c.domainManager.domainNodeSetGrpVec, domainGrp)
domainIndex = len(c.domainManager.domainNodeSetGrpVec) - 1
c.domainManager.domainId2IndexMap[domainId] = domainIndex
}
domainGrp = c.domainManager.domainNodeSetGrpVec[domainIndex]
domainGrp.nodeSetGrpMap = append(domainGrp.nodeSetGrpMap, nsg)
var j int
for j = 0; j < len(domainInfoLoad.NodeSetsIds); j++ {
domainGrp.nsId2NsGrpMap[domainInfoLoad.NodeSetsIds[j]] = len(domainGrp.nodeSetGrpMap) - 1
log.LogInfof("action[loadNodeSetGrps] get result index[%v] nodesetid[%v] nodesetgrp index [%v]",
domainInfoLoad.ID, domainInfoLoad.NodeSetsIds[j], domainInfoLoad.Status)
}
log.LogInfof("action[loadNodeSetGrps], nsgId[%v],status[%v]", nsg.ID, nsg.status)
}
return
}
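// loadDataNodes restores the persisted data nodes, keeping the entry with the smallest ID when the same address appears more than once.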
func (c *Cluster) loadDataNodes() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(dataNodePrefix))
if err != nil {
err = fmt.Errorf("action[loadDataNodes],err:%v", err.Error())
return err
}
for _, value := range result {
dnv := &dataNodeValue{}
if err = json.Unmarshal(value, dnv); err != nil {
err = fmt.Errorf("action[loadDataNodes],value:%v,unmarshal err:%v", string(value), err)
return
}
if dnv.ZoneName == "" {
dnv.ZoneName = DefaultZoneName
}
dataNode := newDataNode(dnv.Addr, dnv.ZoneName, c.Name)
dataNode.DpCntLimit = newDpCountLimiter(&c.cfg.MaxDpCntLimit)
dataNode.ID = dnv.ID
dataNode.NodeSetID = dnv.NodeSetID
dataNode.RdOnly = dnv.RdOnly
for _, disk := range dnv.DecommissionedDisks {
dataNode.addDecommissionedDisk(disk)
}
dataNode.DecommissionStatus = dnv.DecommissionStatus
dataNode.DecommissionDstAddr = dnv.DecommissionDstAddr
dataNode.DecommissionRaftForce = dnv.DecommissionRaftForce
dataNode.DecommissionLimit = dnv.DecommissionLimit
dataNode.DecommissionRetry = dnv.DecommissionRetry
dataNode.DecommissionCompleteTime = dnv.DecommissionCompleteTime
dataNode.ToBeOffline = dnv.ToBeOffline
dataNode.DecommissionDiskList = dnv.DecommissionDiskList
dataNode.DecommissionDpTotal = dnv.DecommissionDpTotal
olddn, ok := c.dataNodes.Load(dataNode.Addr)
if ok {
if olddn.(*DataNode).ID <= dataNode.ID {
log.LogDebugf("action[loadDataNodes]: skip addr %v old %v current %v", dataNode.Addr, olddn.(*DataNode).ID, dataNode.ID)
continue
}
}
c.dataNodes.Store(dataNode.Addr, dataNode)
log.LogInfof("action[loadDataNodes],dataNode[%v],dataNodeID[%v],zone[%v],ns[%v]", dataNode.Addr, dataNode.ID, dnv.ZoneName, dnv.NodeSetID)
}
return
}
func (c *Cluster) loadMetaNodes() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(metaNodePrefix))
if err != nil {
err = fmt.Errorf("action[loadMetaNodes],err:%v", err.Error())
return err
}
for _, value := range result {
mnv := &metaNodeValue{}
if err = json.Unmarshal(value, mnv); err != nil {
err = fmt.Errorf("action[loadMetaNodes],unmarshal err:%v", err.Error())
return err
}
if mnv.ZoneName == "" {
mnv.ZoneName = DefaultZoneName
}
metaNode := newMetaNode(mnv.Addr, mnv.ZoneName, c.Name)
metaNode.ID = mnv.ID
metaNode.NodeSetID = mnv.NodeSetID
metaNode.RdOnly = mnv.RdOnly
oldmn, ok := c.metaNodes.Load(metaNode.Addr)
if ok {
if oldmn.(*MetaNode).ID <= metaNode.ID {
continue
}
}
c.metaNodes.Store(metaNode.Addr, metaNode)
log.LogInfof("action[loadMetaNodes],metaNode[%v], metaNodeID[%v],zone[%v],ns[%v]", metaNode.Addr, metaNode.ID, mnv.ZoneName, mnv.NodeSetID)
}
return
}
func (c *Cluster) loadVolsViews() (err error, volViews []*volValue) {
result, err := c.fsm.store.SeekForPrefix([]byte(volPrefix))
if err != nil {
err = fmt.Errorf("action[loadVols],err:%v", err.Error())
return
}
for _, value := range result {
var vv *volValue
if vv, err = newVolValueFromBytes(value); err != nil {
err = fmt.Errorf("action[loadVols],value:%v,unmarshal err:%v", string(value), err)
return
}
volViews = append(volViews, vv)
log.LogInfof("action[loadVols],vol[%v]", vv.Name)
}
return
}
func (c *Cluster) loadVols() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(volPrefix))
if err != nil {
err = fmt.Errorf("action[loadVols],err:%v", err.Error())
return err
}
for _, value := range result {
var vv *volValue
if vv, err = newVolValueFromBytes(value); err != nil {
err = fmt.Errorf("action[loadVols],value:%v,unmarshal err:%v", string(value), err)
return err
}
vol := newVolFromVolValue(vv)
vol.Status = vv.Status
if err = c.loadAclList(vol); err != nil {
log.LogInfof("action[loadVols],vol[%v] load acl manager error %v", vol.Name, err)
continue
}
if err = c.loadUidSpaceList(vol); err != nil {
log.LogInfof("action[loadVols],vol[%v] load uid manager error %v", vol.Name, err)
continue
}
if err = c.loadMultiVersion(vol); err != nil {
log.LogInfof("action[loadVols],vol[%v] load ver manager error %v c %v", vol.Name, err, c)
continue
}
c.putVol(vol)
log.LogInfof("action[loadVols],vol[%v]", vol.Name)
}
return
}
func (c *Cluster) loadMetaPartitions() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(metaPartitionPrefix))
if err != nil {
err = fmt.Errorf("action[loadMetaPartitions],err:%v", err.Error())
return err
}
for _, value := range result {
mpv := &metaPartitionValue{}
if err = json.Unmarshal(value, mpv); err != nil {
err = fmt.Errorf("action[loadMetaPartitions],value:%v,unmarshal err:%v", string(value), err)
return err
}
vol, err1 := c.getVol(mpv.VolName)
if err1 != nil {
log.LogErrorf("action[loadMetaPartitions] err:%v", err1.Error())
continue
}
if vol.ID != mpv.VolID {
Warn(c.Name, fmt.Sprintf("action[loadMetaPartitions] has duplicate vol[%v],vol.gridId[%v],mpv.VolID[%v]", mpv.VolName, vol.ID, mpv.VolID))
continue
}
for i := 0; i < len(mpv.Peers); i++ {
mn, ok := c.metaNodes.Load(mpv.Peers[i].Addr)
if ok && mn.(*MetaNode).ID != mpv.Peers[i].ID {
mpv.Peers[i].ID = mn.(*MetaNode).ID
}
}
mp := newMetaPartition(mpv.PartitionID, mpv.Start, mpv.End, vol.mpReplicaNum, vol.Name, mpv.VolID, 0)
mp.setHosts(strings.Split(mpv.Hosts, underlineSeparator))
mp.setPeers(mpv.Peers)
mp.OfflinePeerID = mpv.OfflinePeerID
mp.IsRecover = mpv.IsRecover
vol.addMetaPartition(mp)
c.addBadMetaParitionIdMap(mp)
log.LogInfof("action[loadMetaPartitions],vol[%v],mp[%v]", vol.Name, mp.PartitionID)
}
return
}
func (c *Cluster) addBadMetaParitionIdMap(mp *MetaPartition) {
if !mp.IsRecover {
return
}
c.putBadMetaPartitions(mp.Hosts[0], mp.PartitionID)
}
func (c *Cluster) loadDataPartitions() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(dataPartitionPrefix))
if err != nil {
err = fmt.Errorf("action[loadDataPartitions],err:%v", err.Error())
return err
}
for _, value := range result {
dpv := &dataPartitionValue{}
if err = json.Unmarshal(value, dpv); err != nil {
err = fmt.Errorf("action[loadDataPartitions],value:%v,unmarshal err:%v", string(value), err)
return err
}
vol, err1 := c.getVol(dpv.VolName)
if err1 != nil {
log.LogErrorf("action[loadDataPartitions] err:%v %v", dpv.VolName, err1.Error())
continue
}
if vol.ID != dpv.VolID {
Warn(c.Name, fmt.Sprintf("action[loadDataPartitions] has duplicate vol[%v],vol.gridId[%v],mpv.VolID[%v]", dpv.VolName, vol.ID, dpv.VolID))
continue
}
dp := dpv.Restore(c)
vol.dataPartitions.put(dp)
c.addBadDataPartitionIdMap(dp)
// add to nodeset decommission list
go dp.addToDecommissionList(c)
log.LogInfof("action[loadDataPartitions],vol[%v],dp[%v] ", vol.Name, dp.PartitionID)
}
return
}
func (c *Cluster) loadQuota() (err error) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for name, vol := range c.vols {
if err = vol.loadQuotaManager(c); err != nil {
log.LogErrorf("loadQuota loadQuotaManager vol [%v] fail err [%v]", name, err.Error())
return err
}
}
return
}
// load s3api qos info to memory cache
func (c *Cluster) loadS3ApiQosInfo() (err error) {
keyPrefix := S3QoSPrefix
result, err := c.fsm.store.SeekForPrefix([]byte(keyPrefix))
if err != nil {
err = fmt.Errorf("loadS3ApiQosInfo get failed, err [%v]", err)
return err
}
for key, value := range result {
s3qosQuota, err := strconv.ParseUint(string(value), 10, 64)
if err != nil {
return err
}
log.LogDebugf("loadS3ApiQosInfo key[%v] value[%v]", key, s3qosQuota)
c.S3ApiQosQuota.Store(key, s3qosQuota)
}
return
}
func (c *Cluster) addBadDataPartitionIdMap(dp *DataPartition) {
if !dp.IsDecommissionRunning() {
return
}
c.putBadDataPartitionIDsByDiskPath(dp.DecommissionSrcDiskPath, dp.DecommissionSrcAddr, dp.PartitionID)
}
func (c *Cluster) syncAddDecommissionDisk(disk *DecommissionDisk) (err error) {
return c.syncPutDecommissionDiskInfo(opSyncAddDecommissionDisk, disk)
}
func (c *Cluster) syncDeleteDecommissionDisk(disk *DecommissionDisk) (err error) {
return c.syncPutDecommissionDiskInfo(opSyncDeleteDecommissionDisk, disk)
}
func (c *Cluster) syncUpdateDecommissionDisk(disk *DecommissionDisk) (err error) {
return c.syncPutDecommissionDiskInfo(opSyncUpdateDecommissionDisk, disk)
}
func (c *Cluster) syncPutDecommissionDiskInfo(opType uint32, disk *DecommissionDisk) (err error) {
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = DecommissionDiskPrefix + disk.SrcAddr + keySeparator + disk.DiskPath
ddv := newDecommissionDiskValue(disk)
metadata.V, err = json.Marshal(ddv)
if err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
type decommissionDiskValue struct {
SrcAddr string
DstAddr string
DiskPath string
DecommissionStatus uint32
DecommissionRaftForce bool
DecommissionRetry uint8
DecommissionDpTotal int
DecommissionTerm uint64
Type uint32
DecommissionCompleteTime int64
DecommissionLimit int
}
func newDecommissionDiskValue(disk *DecommissionDisk) *decommissionDiskValue {
return &decommissionDiskValue{
SrcAddr: disk.SrcAddr,
DstAddr: disk.DstAddr,
DiskPath: disk.DiskPath,
DecommissionRetry: disk.DecommissionRetry,
DecommissionStatus: atomic.LoadUint32(&disk.DecommissionStatus),
DecommissionRaftForce: disk.DecommissionRaftForce,
DecommissionDpTotal: disk.DecommissionDpTotal,
DecommissionTerm: disk.DecommissionTerm,
Type: disk.Type,
DecommissionCompleteTime: disk.DecommissionCompleteTime,
DecommissionLimit: disk.DecommissionDpCount,
}
}
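// Restore rebuilds a DecommissionDisk from its persisted value.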
func (ddv *decommissionDiskValue) Restore() *DecommissionDisk {
return &DecommissionDisk{
SrcAddr: ddv.SrcAddr,
DstAddr: ddv.DstAddr,
DiskPath: ddv.DiskPath,
DecommissionRetry: ddv.DecommissionRetry,
DecommissionStatus: ddv.DecommissionStatus,
DecommissionRaftForce: ddv.DecommissionRaftForce,
DecommissionDpTotal: ddv.DecommissionDpTotal,
DecommissionTerm: ddv.DecommissionTerm,
Type: ddv.Type,
DecommissionCompleteTime: ddv.DecommissionCompleteTime,
DecommissionDpCount: ddv.DecommissionLimit,
}
}
func (c *Cluster) loadDecommissionDiskList() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(DecommissionDiskPrefix))
if err != nil {
err = fmt.Errorf("action[loadDataPartitions],err:%v", err.Error())
return err
}
for _, value := range result {
ddv := &decommissionDiskValue{}
if err = json.Unmarshal(value, ddv); err != nil {
err = fmt.Errorf("action[loadDecommissionDiskList],value:%v,unmarshal err:%v", string(value), err)
return err
}
dd := ddv.Restore()
c.DecommissionDisks.Store(dd.GenerateKey(), dd)
log.LogInfof("action[loadDecommissionDiskList],decommissionDisk[%v] type %v dst[%v] status[%v] raftForce[%v]"+
"dpTotal[%v] term[%v]",
dd.GenerateKey(), dd.Type, dd.DstAddr, dd.GetDecommissionStatus(), dd.DecommissionRaftForce,
dd.DecommissionDpTotal, dd.DecommissionTerm)
c.addDecommissionDiskToNodeset(dd)
}
return
}
func (c *Cluster) startDecommissionListTraverse() (err error) {
zones := c.t.getAllZones()
log.LogDebugf("startDecommissionListTraverse zones len %v", len(zones))
for _, zone := range zones {
log.LogDebugf("startDecommissionListTraverse zone %v ", zone.name)
err = zone.startDecommissionListTraverse(c)
if err != nil {
return
}
}
return
}
func (c *Cluster) syncAddLcNode(ln *LcNode) (err error) {
return c.syncPutLcNodeInfo(opSyncAddLcNode, ln)
}
func (c *Cluster) syncDeleteLcNode(ln *LcNode) (err error) {
return c.syncPutLcNodeInfo(opSyncDeleteLcNode, ln)
}
func (c *Cluster) syncUpdateLcNode(ln *LcNode) (err error) {
return c.syncPutLcNodeInfo(opSyncUpdateLcNode, ln)
}
func (c *Cluster) syncPutLcNodeInfo(opType uint32, ln *LcNode) (err error) {
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = lcNodePrefix + ln.Addr
lnv := newLcNodeValue(ln)
metadata.V, err = json.Marshal(lnv)
if err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
type lcNodeValue struct {
ID uint64
Addr string
}
func newLcNodeValue(lcNode *LcNode) *lcNodeValue {
return &lcNodeValue{
ID: lcNode.ID,
Addr: lcNode.Addr,
}
}
func (c *Cluster) loadLcNodes() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(lcNodePrefix))
if err != nil {
err = fmt.Errorf("action[loadLcNodes],err:%v", err.Error())
return err
}
log.LogInfof("action[loadLcNodes], result count %v", len(result))
for _, value := range result {
lnv := &lcNodeValue{}
if err = json.Unmarshal(value, lnv); err != nil {
err = fmt.Errorf("action[loadLcNodes],value:%v,unmarshal err:%v", string(value), err)
return
}
log.LogInfof("action[loadLcNodes], load lcNode[%v], lcNodeID[%v]", lnv.Addr, lnv.ID)
lcNode := newLcNode(lnv.Addr, c.Name)
lcNode.ID = lnv.ID
c.lcNodes.Store(lcNode.Addr, lcNode)
log.LogInfof("action[loadLcNodes], store lcNode[%v], lcNodeID[%v]", lcNode.Addr, lcNode.ID)
}
return
}
func (c *Cluster) syncAddLcConf(lcConf *bsProto.LcConfiguration) (err error) {
return c.syncPutLcConfInfo(opSyncAddLcConf, lcConf)
}
func (c *Cluster) syncDeleteLcConf(lcConf *bsProto.LcConfiguration) (err error) {
return c.syncPutLcConfInfo(opSyncDeleteLcConf, lcConf)
}
func (c *Cluster) syncUpdateLcConf(lcConf *bsProto.LcConfiguration) (err error) {
return c.syncPutLcConfInfo(opSyncUpdateLcConf, lcConf)
}
func (c *Cluster) syncPutLcConfInfo(opType uint32, lcConf *bsProto.LcConfiguration) (err error) {
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = lcConfPrefix + lcConf.VolName
metadata.V, err = json.Marshal(lcConf)
if err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
func (c *Cluster) loadLcConfs() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(lcConfPrefix))
if err != nil {
err = fmt.Errorf("action[loadLcConfs],err:%v", err.Error())
return err
}
for _, value := range result {
lcConf := &bsProto.LcConfiguration{}
if err = json.Unmarshal(value, lcConf); err != nil {
err = fmt.Errorf("action[loadLcConfs],value:%v,unmarshal err:%v", string(value), err)
return
}
_ = c.lcMgr.SetS3BucketLifecycle(lcConf)
log.LogInfof("action[loadLcConfs],vol[%v]", lcConf.VolName)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"io"
"github.com/tecbot/gorocksdb"
)
// MetadataSnapshot represents the snapshot of a meta partition
type MetadataSnapshot struct {
fsm *MetadataFsm
applied uint64
snapshot *gorocksdb.Snapshot
iterator *gorocksdb.Iterator
}
// ApplyIndex implements the Snapshot interface
func (ms *MetadataSnapshot) ApplyIndex() uint64 {
return ms.applied
}
// Close implements the Snapshot interface
func (ms *MetadataSnapshot) Close() {
ms.fsm.store.ReleaseSnapshot(ms.snapshot)
}
// Next implements the Snapshot interface
func (ms *MetadataSnapshot) Next() (data []byte, err error) {
md := new(RaftCmd)
if ms.iterator.Valid() {
key := ms.iterator.Key()
md.K = string(key.Data())
md.setOpType()
value := ms.iterator.Value()
if value != nil {
md.V = value.Data()
}
if data, err = md.Marshal(); err != nil {
err = fmt.Errorf("action[Next],marshal kv:%v,err:%v", md, err.Error())
return nil, err
}
ms.iterator.Next()
return data, nil
}
return nil, io.EOF
}
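// drainMetadataSnapshot is an illustrative helper only (a minimal sketch, not
// part of the master's own replication flow): it shows how a consumer of this
// Snapshot implementation is expected to pull marshaled entries with Next()
// until io.EOF and then release the underlying rocksdb snapshot with Close().
// Decoding and applying each entry is left to the caller-supplied apply func.
func drainMetadataSnapshot(ms *MetadataSnapshot, apply func(data []byte) error) error {
defer ms.Close()
for {
data, err := ms.Next()
if err == io.EOF {
// iterator exhausted: the snapshot has been fully consumed
return nil
}
if err != nil {
return err
}
if err = apply(data); err != nil {
return err
}
}
}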
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package mocktest
import (
"bytes"
"encoding/json"
"fmt"
"net"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
)
const (
defaultUsedSize = 20 * util.GB
)
type MockDataServer struct {
nodeID uint64
TcpAddr string
Zone string
ClusterID string
Total uint64
Used uint64
Available uint64
CreatedPartitionWeights uint64 // dataPartitionCnt*dataPartitionSize
RemainWeightsForCreatePartition uint64 // all - usedDataPartitionsWeights
CreatedPartitionCnt uint64
MaxWeightsForCreatePartition uint64
partitions []*MockDataPartition
zoneName string
mc *master.MasterClient
sync.RWMutex
}
func NewMockDataServer(addr string, zoneName string) *MockDataServer {
mds := &MockDataServer{
TcpAddr: addr,
zoneName: zoneName,
partitions: make([]*MockDataPartition, 0),
mc: master.NewMasterClient([]string{hostAddr}, false),
}
return mds
}
func (mds *MockDataServer) Start() {
mds.register()
go mds.start()
}
func (mds *MockDataServer) register() {
var err error
var nodeID uint64
var retry int
for retry < 3 {
nodeID, err = mds.mc.NodeAPI().AddDataNode(mds.TcpAddr, mds.zoneName)
if err == nil {
break
}
time.Sleep(500 * time.Millisecond)
retry++
}
if err != nil {
panic(err)
}
mds.nodeID = nodeID
}
func (mds *MockDataServer) start() {
listener, err := net.Listen("tcp", mds.TcpAddr)
if err != nil {
panic(err)
}
for {
conn, err := listener.Accept()
if err != nil {
panic(err)
}
go mds.serveConn(conn)
}
}
func (mds *MockDataServer) serveConn(rc net.Conn) {
conn, ok := rc.(*net.TCPConn)
if !ok {
rc.Close()
return
}
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
proto.InitBufferPool(int64(32768))
req := proto.NewPacket()
err := req.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime)
if err != nil {
return
}
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(req.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
responseAckErrToMaster(conn, req, err)
return
}
switch req.Opcode {
case proto.OpCreateDataPartition:
err = mds.handleCreateDataPartition(conn, req, adminTask)
Printf("data node [%v] create data partition,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpDeleteDataPartition:
err = mds.handleDeleteDataPartition(conn, req)
Printf("data node [%v] delete data partition,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpDataNodeHeartbeat:
err = mds.handleHeartbeats(conn, req, adminTask)
Printf("data node [%v] report heartbeat to master,err:%v\n", mds.TcpAddr, err)
case proto.OpLoadDataPartition:
err = mds.handleLoadDataPartition(conn, req, adminTask)
Printf("data node [%v] load data partition,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpDecommissionDataPartition:
err = mds.handleDecommissionDataPartition(conn, req, adminTask)
Printf("data node [%v] decommission data partition,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpAddDataPartitionRaftMember:
err = mds.handleAddDataPartitionRaftMember(conn, req, adminTask)
Printf("data node [%v] add data partition raft member,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpRemoveDataPartitionRaftMember:
err = mds.handleRemoveDataPartitionRaftMember(conn, req, adminTask)
Printf("data node [%v] remove data partition raft member,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpDataPartitionTryToLeader:
err = mds.handleTryToLeader(conn, req, adminTask)
Printf("data node [%v] try to leader,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
default:
fmt.Printf("unknown code [%v]\n", req.Opcode)
}
}
func (mds *MockDataServer) handleAddDataPartitionRaftMember(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mds *MockDataServer) handleRemoveDataPartitionRaftMember(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mds *MockDataServer) handleTryToLeader(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mds *MockDataServer) CheckVolPartition(name string, cond func(*MockDataPartition) bool) bool {
mds.RLock()
defer mds.RUnlock()
for _, dp := range mds.partitions {
if dp.VolName == name && !cond(dp) {
return false
}
}
return true
}
func (mds *MockDataServer) handleDecommissionDataPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
defer func() {
if err != nil {
responseAckErrToMaster(conn, p, err)
} else {
p.PacketOkWithBody([]byte("/cfs"))
p.WriteToConn(conn)
}
}()
// Marshal request body.
requestJson, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
// Unmarshal request to entity
req := &proto.DataPartitionDecommissionRequest{}
if err = json.Unmarshal(requestJson, req); err != nil {
return
}
partitions := make([]*MockDataPartition, 0)
mds.RLock()
defer mds.RUnlock()
for index, dp := range mds.partitions {
if dp.PartitionID == req.PartitionId {
partitions = append(partitions, mds.partitions[:index]...)
partitions = append(partitions, mds.partitions[index+1:]...)
}
}
if len(partitions) != 0 {
mds.partitions = partitions
}
return
}
func (mds *MockDataServer) handleCreateDataPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
defer func() {
if err != nil {
responseAckErrToMaster(conn, p, err)
} else {
responseAckOKToMaster(conn, p, nil)
}
}()
// Marshal request body.
requestJson, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
// Unmarshal request to entity
req := &proto.CreateDataPartitionRequest{}
if err = json.Unmarshal(requestJson, req); err != nil {
return
}
// Create new partition.
partition := &MockDataPartition{
PartitionID: req.PartitionId,
VolName: req.VolumeId,
total: req.PartitionSize,
used: defaultUsedSize,
}
partition.SetForbidden(false)
mds.Lock()
defer mds.Unlock()
mds.partitions = append(mds.partitions, partition)
return
}
func (mds *MockDataServer) checkVolumeForbidden(volNames []string, dp *MockDataPartition) {
for _, volName := range volNames {
if volName == dp.VolName {
dp.SetForbidden(true)
return
}
}
dp.SetForbidden(false)
}
// Handle OpHeartbeat packet.
func (mds *MockDataServer) handleHeartbeats(conn net.Conn, pkg *proto.Packet, task *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, pkg, nil)
response := &proto.DataNodeHeartbeatResponse{}
req := &proto.HeartBeatRequest{}
reqData, err := json.Marshal(task.Request)
if err != nil {
response.Status = proto.TaskFailed
response.Result = err.Error()
goto end
}
if err = json.Unmarshal(reqData, req); err != nil {
response.Status = proto.TaskFailed
response.Result = err.Error()
goto end
}
response.Status = proto.TaskSucceeds
response.Used = 5 * util.GB
response.Total = 1024 * util.GB
response.Available = 1024 * util.GB
response.CreatedPartitionCnt = 3
response.TotalPartitionSize = 120 * util.GB
response.MaxCapacity = 800 * util.GB
response.RemainingCapacity = 800 * util.GB
response.ZoneName = mds.zoneName
response.PartitionReports = make([]*proto.DataPartitionReport, 0)
mds.RLock()
for _, partition := range mds.partitions {
mds.checkVolumeForbidden(req.ForbiddenVols, partition)
vr := &proto.DataPartitionReport{
PartitionID: partition.PartitionID,
PartitionStatus: proto.ReadWrite,
Total: 120 * util.GB,
Used: defaultUsedSize,
DiskPath: "/cfs",
ExtentCount: 10,
NeedCompare: true,
IsLeader: true, // todo
VolName: partition.VolName,
}
response.PartitionReports = append(response.PartitionReports, vr)
}
mds.RUnlock()
task.Response = response
end:
if err = mds.mc.NodeAPI().ResponseDataNodeTask(task); err != nil {
return
}
return
}
func (mds *MockDataServer) handleDeleteDataPartition(conn net.Conn, pkg *proto.Packet) (err error) {
err = responseAckOKToMaster(conn, pkg, nil)
return
}
func (mds *MockDataServer) handleLoadDataPartition(conn net.Conn, pkg *proto.Packet, task *proto.AdminTask) (err error) {
if err = responseAckOKToMaster(conn, pkg, nil); err != nil {
return
}
// Marshal request body.
requestJson, err := json.Marshal(task.Request)
if err != nil {
return
}
// Unmarshal request to entity
req := &proto.LoadDataPartitionRequest{}
if err = json.Unmarshal(requestJson, req); err != nil {
return
}
partitionID := uint64(req.PartitionId)
response := &proto.LoadDataPartitionResponse{}
response.PartitionId = partitionID
response.Used = defaultUsedSize
response.PartitionSnapshot = buildSnapshot()
response.Status = proto.TaskSucceeds
var partition *MockDataPartition
mds.RLock()
for _, dp := range mds.partitions {
if dp.PartitionID == partitionID {
partition = dp
break
}
}
mds.RUnlock()
if partition == nil {
return
}
// response.VolName = partition.VolName
task.Response = response
if err = mds.mc.NodeAPI().ResponseDataNodeTask(task); err != nil {
return
}
return
}
func buildSnapshot() (files []*proto.File) {
files = make([]*proto.File, 0)
f1 := &proto.File{
Name: "1",
Crc: 4045512210,
Size: 2 * util.MB,
Modified: 1562507765,
}
files = append(files, f1)
f2 := &proto.File{
Name: "2",
Crc: 4045512210,
Size: 2 * util.MB,
Modified: 1562507765,
}
files = append(files, f2)
f3 := &proto.File{
Name: "50000010",
Crc: 4045512210,
Size: 2 * util.MB,
Modified: 1562507765,
}
files = append(files, f3)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package mocktest
import (
"bytes"
"encoding/json"
"fmt"
"net"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
)
type MockMetaServer struct {
NodeID uint64
TcpAddr string
ZoneName string
mc *master.MasterClient
partitions map[uint64]*MockMetaPartition // Key: metaRangeId, Val: metaPartition
sync.RWMutex
}
func NewMockMetaServer(addr string, zoneName string) *MockMetaServer {
mms := &MockMetaServer{
TcpAddr: addr, partitions: make(map[uint64]*MockMetaPartition, 0),
ZoneName: zoneName,
mc: master.NewMasterClient([]string{hostAddr}, false),
}
return mms
}
func (mms *MockMetaServer) Start() {
mms.register()
go mms.start()
}
func (mms *MockMetaServer) register() {
var err error
var nodeID uint64
var retry int
for retry < 3 {
nodeID, err = mms.mc.NodeAPI().AddMetaNode(mms.TcpAddr, mms.ZoneName)
if err == nil {
break
}
time.Sleep(500 * time.Millisecond)
retry++
}
if err != nil {
panic(err)
}
mms.NodeID = nodeID
}
func (mms *MockMetaServer) start() {
s := strings.Split(mms.TcpAddr, ColonSeparator)
listener, err := net.Listen("tcp", ":"+s[1])
if err != nil {
panic(err)
}
for {
conn, err := listener.Accept()
if err != nil {
fmt.Printf("accept conn error: [%v]\n", err)
continue
}
go mms.serveConn(conn)
}
}
func (mms *MockMetaServer) serveConn(rc net.Conn) {
Printf("remote[%v],local[%v]\n", rc.RemoteAddr(), rc.LocalAddr())
conn, ok := rc.(*net.TCPConn)
if !ok {
rc.Close()
return
}
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
req := proto.NewPacket()
err := req.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime)
if err != nil {
fmt.Printf("remote [%v] err is [%v]\n", conn.RemoteAddr(), err)
return
}
Printf("remote [%v] req [%v]\n", conn.RemoteAddr(), req.GetOpMsg())
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(req.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
responseAckErrToMaster(conn, req, err)
return
}
switch req.Opcode {
case proto.OpCreateMetaPartition:
err = mms.handleCreateMetaPartition(conn, req, adminTask)
Printf("meta node [%v] create meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpMetaNodeHeartbeat:
err = mms.handleHeartbeats(conn, req, adminTask)
Printf("meta node [%v] heartbeat,err:%v\n", mms.TcpAddr, err)
case proto.OpDeleteMetaPartition:
err = mms.handleDeleteMetaPartition(conn, req, adminTask)
Printf("meta node [%v] delete meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpUpdateMetaPartition:
err = mms.handleUpdateMetaPartition(conn, req, adminTask)
Printf("meta node [%v] update meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpLoadMetaPartition:
err = mms.handleLoadMetaPartition(conn, req, adminTask)
Printf("meta node [%v] load meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpDecommissionMetaPartition:
err = mms.handleDecommissionMetaPartition(conn, req, adminTask)
Printf("meta node [%v] offline meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpAddMetaPartitionRaftMember:
err = mms.handleAddMetaPartitionRaftMember(conn, req, adminTask)
Printf("meta node [%v] add data partition raft member,id[%v],err:%v\n", mms.TcpAddr, adminTask.ID, err)
case proto.OpRemoveMetaPartitionRaftMember:
err = mms.handleRemoveMetaPartitionRaftMember(conn, req, adminTask)
Printf("meta node [%v] remove data partition raft member,id[%v],err:%v\n", mms.TcpAddr, adminTask.ID, err)
case proto.OpMetaPartitionTryToLeader:
err = mms.handleTryToLeader(conn, req, adminTask)
Printf("meta node [%v] try to leader,id[%v],err:%v\n", mms.TcpAddr, adminTask.ID, err)
default:
fmt.Printf("unknown code [%v]\n", req.Opcode)
}
}
func (mms *MockMetaServer) handleAddMetaPartitionRaftMember(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mms *MockMetaServer) handleRemoveMetaPartitionRaftMember(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mms *MockMetaServer) handleTryToLeader(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
mms.Lock()
mp := mms.partitions[adminTask.PartitionID]
for i := range mp.Replicas {
if mp.Replicas[i].IsLeader {
mp.Replicas[i].IsLeader = false
}
if mp.Replicas[i].Addr == adminTask.OperatorAddr {
mp.Replicas[i].IsLeader = true
}
}
mms.Unlock()
return
}
func (mms *MockMetaServer) CheckVolPartition(name string, cond func(*MockMetaPartition) bool) bool {
mms.RLock()
defer mms.RUnlock()
for _, mp := range mms.partitions {
if mp.VolName == name && !cond(mp) {
return false
}
}
return true
}
func (mms *MockMetaServer) handleCreateMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
defer func() {
if err != nil {
responseAckErrToMaster(conn, p, err)
} else {
responseAckOKToMaster(conn, p, nil)
}
}()
// Marshal request body.
requestJson, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
// Unmarshal request to entity
req := &proto.CreateMetaPartitionRequest{}
if err = json.Unmarshal(requestJson, req); err != nil {
return
}
// Create new metaPartition.
replicas := make([]*MockMetaReplica, 0)
for i, member := range req.Members {
re := &MockMetaReplica{Addr: member.Addr, IsLeader: false}
// only one replica is marked as leader: choose the first member as the leader of the mp
if i == 0 {
re.IsLeader = true
}
replicas = append(replicas, re)
}
partition := &MockMetaPartition{
PartitionID: req.PartitionID,
VolName: req.VolName,
Start: req.Start,
End: req.End,
Cursor: req.Start,
Members: req.Members,
Replicas: replicas,
}
partition.SetEnableAuditLog(true)
partition.SetForbidden(false)
mms.Lock()
mms.partitions[req.PartitionID] = partition
mms.Unlock()
return
}
func (mms *MockMetaServer) checkForbiddenVolume(volNames []string, mp *MockMetaPartition) {
for _, volName := range volNames {
if mp.VolName == volName {
mp.SetForbidden(true)
return
}
}
mp.SetForbidden(false)
}
func (mms *MockMetaServer) checkAuditLogVolume(volNames []string, mp *MockMetaPartition) {
for _, volName := range volNames {
if mp.VolName == volName {
mp.SetEnableAuditLog(false)
return
}
}
mp.SetEnableAuditLog(true)
}
// Handle OpHeartbeat packet.
func (mms *MockMetaServer) handleHeartbeats(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
// For ack to master
responseAckOKToMaster(conn, p, nil)
var (
req = &proto.HeartBeatRequest{}
resp = &proto.MetaNodeHeartbeatResponse{}
reqData []byte
)
reqData, err = json.Marshal(adminTask.Request)
if err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
goto end
}
if err = json.Unmarshal(reqData, req); err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
goto end
}
resp.Total = 10 * util.GB
resp.MemUsed = 1 * util.GB
// report usage for every partition
mms.RLock()
for id, partition := range mms.partitions {
mms.checkForbiddenVolume(req.ForbiddenVols, partition)
mms.checkAuditLogVolume(req.DisableAuditVols, partition)
mpr := &proto.MetaPartitionReport{
PartitionID: id,
Start: partition.Start,
End: partition.End,
Status: proto.ReadWrite,
MaxInodeID: partition.Start,
VolName: partition.VolName,
IsLeader: partition.isLeaderMetaNode(mms.TcpAddr),
}
mpr.Status = proto.ReadWrite
resp.MetaPartitionReports = append(resp.MetaPartitionReports, mpr)
}
mms.RUnlock()
resp.ZoneName = mms.ZoneName
resp.Status = proto.TaskSucceeds
end:
return mms.postResponseToMaster(adminTask, resp)
}
func (mms *MockMetaServer) postResponseToMaster(adminTask *proto.AdminTask, resp interface{}) (err error) {
adminTask.Request = nil
adminTask.Response = resp
if err = mms.mc.NodeAPI().ResponseMetaNodeTask(adminTask); err != nil {
return
}
return
}
func (mms *MockMetaServer) handleDeleteMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
req := &proto.DeleteMetaPartitionRequest{}
reqData, err := json.Marshal(adminTask.Request)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, nil)
responseAckErrToMaster(conn, p, err)
return
}
if err = json.Unmarshal(reqData, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, nil)
responseAckErrToMaster(conn, p, err)
return
}
resp := &proto.DeleteMetaPartitionResponse{
PartitionID: req.PartitionID,
Status: proto.TaskSucceeds,
}
return mms.postResponseToMaster(adminTask, resp)
}
func (mms *MockMetaServer) handleUpdateMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
req := &proto.UpdateMetaPartitionRequest{}
reqData, err := json.Marshal(adminTask.Request)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, nil)
responseAckErrToMaster(conn, p, err)
return
}
if err = json.Unmarshal(reqData, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, nil)
responseAckErrToMaster(conn, p, err)
return
}
resp := &proto.UpdateMetaPartitionResponse{
VolName: req.VolName,
PartitionID: req.PartitionID,
End: req.End,
}
mms.Lock()
partition := mms.partitions[req.PartitionID]
partition.End = req.End
mms.Unlock()
return mms.postResponseToMaster(adminTask, resp)
}
func (mms *MockMetaServer) handleLoadMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
var data []byte
defer func() {
if err != nil {
responseAckErrToMaster(conn, p, err)
} else {
responseAckOKToMaster(conn, p, data)
}
}()
req := &proto.MetaPartitionLoadRequest{}
reqData, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
resp := &proto.MetaPartitionLoadResponse{
PartitionID: req.PartitionID,
DoCompare: true,
ApplyID: 100,
MaxInode: 123456,
DentryCount: 123456,
}
data, err = json.Marshal(resp)
if err != nil {
return
}
return
}
func (mms *MockMetaServer) handleDecommissionMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
req := &proto.MetaPartitionDecommissionRequest{}
reqData, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
resp := &proto.MetaPartitionDecommissionResponse{
PartitionID: req.PartitionID,
VolName: req.VolName,
Status: proto.TaskSucceeds,
}
return mms.postResponseToMaster(adminTask, resp)
}
package mocktest
import (
"sync/atomic"
"github.com/cubefs/cubefs/proto"
)
type MockDataPartition struct {
PartitionID uint64
PersistenceHosts []string
total int
used uint64
VolName string
Forbidden int32
}
func (md *MockDataPartition) IsForbidden() bool {
return atomic.LoadInt32(&md.Forbidden) != 0
}
func (md *MockDataPartition) SetForbidden(status bool) {
val := 0
if status {
val = 1
}
atomic.StoreInt32(&md.Forbidden, int32(val))
}
type MockMetaPartition struct {
PartitionID uint64
Start uint64
End uint64
Status int8
Cursor uint64
VolName string
Members []proto.Peer
Replicas []*MockMetaReplica
Forbidden int32
EnableAuditLog int32
}
// MockMetaReplica defines the replica of a meta partition
type MockMetaReplica struct {
Addr string
start uint64 // lower bound of the inode id
end uint64 // upper bound of the inode id
dataSize uint64
nodeID uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
ReportTime int64
Status int8 // unavailable, readOnly, readWrite
IsLeader bool
}
func (mm *MockMetaPartition) isLeaderMetaNode(addr string) bool {
for _, mr := range mm.Replicas {
if mr.Addr == addr {
return mr.IsLeader
}
}
return false
}
func (mm *MockMetaPartition) IsEnableAuditLog() bool {
return atomic.LoadInt32(&mm.EnableAuditLog) != 0
}
func (mm *MockMetaPartition) SetEnableAuditLog(status bool) {
val := 0
if status {
val = 1
}
atomic.StoreInt32(&mm.EnableAuditLog, int32(val))
}
func (mm *MockMetaPartition) IsForbidden() bool {
return atomic.LoadInt32(&mm.Forbidden) != 0
}
func (mm *MockMetaPartition) SetForbidden(status bool) {
val := 0
if status {
val = 1
}
atomic.StoreInt32(&mm.Forbidden, int32(val))
}
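// Usage sketch (illustrative only): the int32-backed flags above give the mock
// partitions lock-free boolean state that heartbeat handling can flip
// concurrently, e.g.
//
//	mp.SetForbidden(true)
//	if mp.IsForbidden() {
//		// the next heartbeat would treat this partition as forbidden
//	}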
package mocktest
import (
"bytes"
"fmt"
"io"
"net"
"net/http"
"os"
"testing"
"time"
"github.com/cubefs/cubefs/proto"
)
const (
ColonSeparator = ":"
hostAddr = "127.0.0.1:8080"
)
var (
LogOn = os.Getenv("DOCKER_TESTING_LOG_OFF") == ""
Print = fmt.Print
Printf = fmt.Printf
Println = fmt.Println
)
func init() {
if !LogOn {
SetOutput(io.Discard)
}
}
// SetOutput resets the fmt output writers.
func SetOutput(w io.Writer) {
Print = func(a ...interface{}) (int, error) { return fmt.Fprint(w, a...) }
Printf = func(format string, a ...interface{}) (int, error) { return fmt.Fprintf(w, format, a...) }
Println = func(a ...interface{}) (int, error) { return fmt.Fprintln(w, a...) }
}
func Log(tb testing.TB, a ...interface{}) {
if LogOn {
tb.Log(a...)
}
}
func responseAckOKToMaster(conn net.Conn, p *proto.Packet, data []byte) error {
if len(data) != 0 {
p.PacketOkWithBody(data)
} else {
p.PacketOkReply()
}
return p.WriteToConn(conn)
}
func responseAckErrToMaster(conn net.Conn, p *proto.Packet, err error) error {
status := proto.OpErr
buf := []byte(err.Error())
p.PacketErrorWithBody(status, buf)
p.ResultCode = proto.TaskFailed
return p.WriteToConn(conn)
}
func PostToMaster(method, url string, reqData []byte) (resp *http.Response, err error) {
client := &http.Client{}
reader := bytes.NewReader(reqData)
client.Timeout = time.Second * 3
var req *http.Request
if req, err = http.NewRequest(method, url, reader); err != nil {
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Connection", "close")
resp, err = client.Do(req)
return
}
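// Illustrative usage of PostToMaster (a minimal sketch; the endpoint path is
// only an example and is not asserted by this helper):
//
//	resp, err := PostToMaster(http.MethodGet, "http://"+hostAddr+"/admin/getCluster", nil)
//	if err == nil {
//		defer resp.Body.Close()
//		body, _ := io.ReadAll(resp.Body)
//		Println(string(body))
//	}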
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"strconv"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
// metrics
const (
StatPeriod = time.Minute * time.Duration(1)
MetricDataNodesUsedGB = "dataNodes_used_GB"
MetricDataNodesTotalGB = "dataNodes_total_GB"
MetricDataNodesIncreasedGB = "dataNodes_increased_GB"
MetricMetaNodesUsedGB = "metaNodes_used_GB"
MetricMetaNodesTotalGB = "metaNodes_total_GB"
MetricMetaNodesIncreasedGB = "metaNodes_increased_GB"
MetricDataNodesCount = "dataNodes_count"
MetricMetaNodesCount = "metaNodes_count"
MetricNodeStat = "node_stat"
MetricVolCount = "vol_count"
MetricVolTotalGB = "vol_total_GB"
MetricVolUsedGB = "vol_used_GB"
MetricVolUsageGB = "vol_usage_ratio"
MetricVolMetaCount = "vol_meta_count"
MetricBadMpCount = "bad_mp_count"
MetricBadDpCount = "bad_dp_count"
MetricDiskError = "disk_error"
MetricDataNodesInactive = "dataNodes_inactive"
MetricInactiveDataNodeInfo = "inactive_dataNodes_info"
MetricMetaNodesInactive = "metaNodes_inactive"
MetricDataNodesNotWritable = "dataNodes_not_writable"
MetricMetaNodesNotWritable = "metaNodes_not_writable"
MetricInactiveMataNodeInfo = "inactive_mataNodes_info"
MetricMetaInconsistent = "mp_inconsistent"
MetricMasterNoLeader = "master_no_leader"
MetricMasterNoCache = "master_no_cache"
MetricMasterSnapshot = "master_snapshot"
MetricMissingDp = "missing_dp"
MetricDpNoLeader = "dp_no_leader"
MetricMissingMp = "missing_mp"
MetricMpNoLeader = "mp_no_leader"
MetricDataPartitionCount = "dataPartition_count"
MetricReplicaMissingDPCount = "replica_missing_dp_count"
MetricDpMissingLeaderCount = "dp_missing_Leader_count"
MetricMpMissingLeaderCount = "mp_missing_Leader_count"
MetricDataNodesetInactiveCount = "data_nodeset_inactive_count"
MetricMetaNodesetInactiveCount = "meta_nodeset_inactive_count"
MetricNodesetMetaTotalGB = "nodeset_meta_total_GB"
MetricNodesetMetaUsedGB = "nodeset_meta_used_GB"
MetricNodesetMetaUsageRadio = "nodeset_meta_usage_ratio"
MetricNodesetDataTotalGB = "nodeset_data_total_GB"
MetricNodesetDataUsedGB = "nodeset_data_used_GB"
MetricNodesetDataUsageRadio = "nodeset_data_usage_ratio"
MetricNodesetMpReplicaCount = "nodeset_mp_replica_count"
MetricNodesetDpReplicaCount = "nodeset_dp_replica_count"
MetricLcNodesConcurrentCount = "lcNodes_concurrent"
Metrics3LcTotalScanned = "s3Lc_Total_Scanned"
Metrics3LcTotalFileScanned = "s3Lc_Total_File_Scanned"
Metrics3LcTotalDirScanned = "s3Lc_Total_Dir_Scanned"
Metrics3LcTotalExpired = "s3Lc_Total_Expired"
Metrics3LcAbortedMultipartUpload = "s3Lc_Aborted_Multipart_Upload"
MetricLcNodesCount = "lc_nodes_count"
MetricLcTotalScanned = "lc_total_scanned"
MetricLcTotalFileScanned = "lc_total_file_scanned"
MetricLcTotalDirScanned = "lc_total_dirs_scanned"
MetricLcTotalExpired = "lc_total_expired"
)
var WarnMetrics *warningMetrics
type monitorMetrics struct {
cluster *Cluster
dataNodesCount *exporter.Gauge
metaNodesCount *exporter.Gauge
volCount *exporter.Gauge
dataNodesTotal *exporter.Gauge
dataNodesUsed *exporter.Gauge
dataNodeIncreased *exporter.Gauge
metaNodesTotal *exporter.Gauge
metaNodesUsed *exporter.Gauge
metaNodesIncreased *exporter.Gauge
volTotalSpace *exporter.GaugeVec
volUsedSpace *exporter.GaugeVec
volUsage *exporter.GaugeVec
volMetaCount *exporter.GaugeVec
badMpCount *exporter.Gauge
badDpCount *exporter.Gauge
diskError *exporter.GaugeVec
dataNodesNotWritable *exporter.Gauge
metaNodesNotWritable *exporter.Gauge
dataNodesInactive *exporter.Gauge
InactiveDataNodeInfo *exporter.GaugeVec
metaNodesInactive *exporter.Gauge
InactiveMataNodeInfo *exporter.GaugeVec
dataPartitionCount *exporter.Gauge
ReplicaMissingDPCount *exporter.Gauge
DpMissingLeaderCount *exporter.Gauge
MpMissingLeaderCount *exporter.Gauge
dataNodesetInactiveCount *exporter.GaugeVec
metaNodesetInactiveCount *exporter.GaugeVec
metaEqualCheckFail *exporter.GaugeVec
masterNoLeader *exporter.Gauge
masterNoCache *exporter.GaugeVec
masterSnapshot *exporter.Gauge
nodesetMetaTotal *exporter.GaugeVec
nodesetMetaUsed *exporter.GaugeVec
nodesetMetaUsageRatio *exporter.GaugeVec
nodesetDataTotal *exporter.GaugeVec
nodesetDataUsed *exporter.GaugeVec
nodesetDataUsageRatio *exporter.GaugeVec
nodesetMpReplicaCount *exporter.GaugeVec
nodesetDpReplicaCount *exporter.GaugeVec
nodeStat *exporter.GaugeVec
volNames map[string]struct{}
badDisks map[string]string
nodesetInactiveDataNodesCount map[uint64]int64
nodesetInactiveMetaNodesCount map[uint64]int64
inconsistentMps map[string]string
nodesetIds map[uint64]string
lcNodesCount *exporter.Gauge
lcVolNames map[string]struct{}
lcTotalScanned *exporter.GaugeVec
lcTotalFileScanned *exporter.GaugeVec
lcTotalDirScanned *exporter.GaugeVec
lcTotalExpired *exporter.GaugeVec
}
func newMonitorMetrics(c *Cluster) *monitorMetrics {
return &monitorMetrics{
cluster: c,
volNames: make(map[string]struct{}),
badDisks: make(map[string]string),
nodesetInactiveDataNodesCount: make(map[uint64]int64),
nodesetInactiveMetaNodesCount: make(map[uint64]int64),
inconsistentMps: make(map[string]string),
lcVolNames: make(map[string]struct{}),
}
}
type voidType struct{}
var voidVal voidType
type addrSet struct {
addrs map[string]voidType // empty value of map does not occupy memory
replicaNum string
replicaAlive string
}
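// addrSet is used as a lightweight string set: membership is carried by key
// presence alone, so the empty struct value costs no memory. A minimal usage
// sketch (illustrative):
//
//	set := addrSet{addrs: make(map[string]voidType)}
//	set.addrs["192.168.0.1:17310"] = voidVal // add an address
//	_, ok := set.addrs["192.168.0.1:17310"]  // membership test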
type warningMetrics struct {
cluster *Cluster
missingDp *exporter.GaugeVec
dpNoLeader *exporter.GaugeVec
missingMp *exporter.GaugeVec
mpNoLeader *exporter.GaugeVec
dpMutex sync.Mutex
mpMutex sync.Mutex
dpNoLeaderInfo map[uint64]int64
mpNoLeaderInfo map[uint64]int64
dpMissingReplicaMutex sync.Mutex
mpMissingReplicaMutex sync.Mutex
dpMissingReplicaInfo map[string]addrSet
mpMissingReplicaInfo map[string]addrSet
}
func newWarningMetrics(c *Cluster) *warningMetrics {
return &warningMetrics{
cluster: c,
missingDp: exporter.NewGaugeVec(MetricMissingDp, "", []string{"clusterName", "partitionID", "addr", "ReplicaAlive", "ReplicaNum"}),
dpNoLeader: exporter.NewGaugeVec(MetricDpNoLeader, "", []string{"clusterName", "partitionID"}),
missingMp: exporter.NewGaugeVec(MetricMissingMp, "", []string{"clusterName", "partitionID", "addr"}),
mpNoLeader: exporter.NewGaugeVec(MetricMpNoLeader, "", []string{"clusterName", "partitionID"}),
dpNoLeaderInfo: make(map[uint64]int64),
mpNoLeaderInfo: make(map[uint64]int64),
dpMissingReplicaInfo: make(map[string]addrSet),
mpMissingReplicaInfo: make(map[string]addrSet),
}
}
func (m *warningMetrics) reset() {
log.LogInfo("action[warningMetrics] reset all")
m.dpMutex.Lock()
for dp := range m.dpNoLeaderInfo {
m.dpNoLeader.DeleteLabelValues(m.cluster.Name, strconv.FormatUint(dp, 10))
delete(m.dpNoLeaderInfo, dp)
}
m.dpMutex.Unlock()
m.mpMutex.Lock()
for mp := range m.mpNoLeaderInfo {
m.mpNoLeader.DeleteLabelValues(m.cluster.Name, strconv.FormatUint(mp, 10))
delete(m.mpNoLeaderInfo, mp)
}
m.mpMutex.Unlock()
m.dpMissingReplicaMutex.Lock()
for id, dpAddrSet := range m.dpMissingReplicaInfo {
for addr := range dpAddrSet.addrs {
m.missingDp.DeleteLabelValues(m.cluster.Name, id, addr, dpAddrSet.replicaAlive, dpAddrSet.replicaNum)
}
delete(m.dpMissingReplicaInfo, id)
}
m.dpMissingReplicaMutex.Unlock()
m.mpMissingReplicaMutex.Lock()
for id, mpAddrSet := range m.mpMissingReplicaInfo {
for addr := range mpAddrSet.addrs {
m.missingMp.DeleteLabelValues(m.cluster.Name, id, addr)
}
delete(m.mpMissingReplicaInfo, id)
}
m.mpMissingReplicaMutex.Unlock()
}
// The caller is responsible for holding dpMissingReplicaMutex.
func (m *warningMetrics) deleteMissingDp(missingDpAddrSet addrSet, clusterName, dpId, addr string) {
if len(missingDpAddrSet.addrs) == 0 {
return
}
if _, ok := missingDpAddrSet.addrs[addr]; !ok {
return
}
replicaAlive := m.dpMissingReplicaInfo[dpId].replicaAlive
replicaNum := m.dpMissingReplicaInfo[dpId].replicaNum
delete(missingDpAddrSet.addrs, addr)
if len(missingDpAddrSet.addrs) == 0 {
delete(m.dpMissingReplicaInfo, dpId)
}
m.missingDp.DeleteLabelValues(clusterName, dpId, addr, replicaAlive, replicaNum)
log.LogDebugf("action[deleteMissingDp] delete: dpId(%v), addr(%v)", dpId, addr)
}
// leader only
func (m *warningMetrics) WarnMissingDp(clusterName, addr string, partitionID uint64, report bool) {
m.dpMissingReplicaMutex.Lock()
defer m.dpMissingReplicaMutex.Unlock()
if clusterName != m.cluster.Name {
return
}
id := strconv.FormatUint(partitionID, 10)
if !report {
m.deleteMissingDp(m.dpMissingReplicaInfo[id], clusterName, id, addr)
return
}
// m.missingDp.SetWithLabelValues(1, clusterName, id, addr)
if _, ok := m.dpMissingReplicaInfo[id]; !ok {
m.dpMissingReplicaInfo[id] = addrSet{addrs: make(map[string]voidType)}
// m.dpMissingReplicaInfo[id].addrs = make(addrSet)
}
m.dpMissingReplicaInfo[id].addrs[addr] = voidVal
}
// leader only
func (m *warningMetrics) CleanObsoleteDpMissing(clusterName string, dp *DataPartition) {
m.dpMissingReplicaMutex.Lock()
defer m.dpMissingReplicaMutex.Unlock()
if clusterName != m.cluster.Name {
return
}
id := strconv.FormatUint(dp.PartitionID, 10)
missingRepAddrs, ok := m.dpMissingReplicaInfo[id]
if !ok {
return
}
for addr := range missingRepAddrs.addrs {
_, hasReplica := dp.hasReplica(addr)
hasHost := dp.hasHost(addr)
if !hasReplica && !hasHost {
log.LogDebugf("action[warningMetrics] delete obsolete dp missing record: dpId(%v), addr(%v)", id, addr)
m.deleteMissingDp(missingRepAddrs, clusterName, id, addr)
}
}
}
// leader only
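// Note: the first no-leader report for a partition only records a timestamp;
// the dpNoLeader gauge is set only if the partition is still reported
// leaderless after DpNoLeaderReportIntervalSec, which debounces transient
// leader elections (WarnMpNoLeader below follows the same pattern).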
func (m *warningMetrics) WarnDpNoLeader(clusterName string, partitionID uint64, report bool) {
if clusterName != m.cluster.Name {
return
}
m.dpMutex.Lock()
defer m.dpMutex.Unlock()
t, ok := m.dpNoLeaderInfo[partitionID]
if !report {
if ok {
delete(m.dpNoLeaderInfo, partitionID)
m.dpNoLeader.DeleteLabelValues(clusterName, strconv.FormatUint(partitionID, 10))
}
return
}
now := time.Now().Unix()
if !ok {
m.dpNoLeaderInfo[partitionID] = now
return
}
if now-t > m.cluster.cfg.DpNoLeaderReportIntervalSec {
m.dpNoLeader.SetWithLabelValues(1, clusterName, strconv.FormatUint(partitionID, 10))
m.dpNoLeaderInfo[partitionID] = now
}
}
// The caller is responsible for holding mpMissingReplicaMutex.
func (m *warningMetrics) deleteMissingMp(missingMpAddrSet addrSet, clusterName, mpId, addr string) {
if len(missingMpAddrSet.addrs) == 0 {
return
}
if _, ok := missingMpAddrSet.addrs[addr]; !ok {
return
}
delete(missingMpAddrSet.addrs, addr)
if len(missingMpAddrSet.addrs) == 0 {
delete(m.mpMissingReplicaInfo, mpId)
}
m.missingMp.DeleteLabelValues(clusterName, mpId, addr)
log.LogDebugf("action[deleteMissingMp] delete: mpId(%v), addr(%v)", mpId, addr)
}
// leader only
func (m *warningMetrics) WarnMissingMp(clusterName, addr string, partitionID uint64, report bool) {
m.mpMissingReplicaMutex.Lock()
defer m.mpMissingReplicaMutex.Unlock()
if clusterName != m.cluster.Name {
return
}
id := strconv.FormatUint(partitionID, 10)
if !report {
m.deleteMissingMp(m.mpMissingReplicaInfo[id], clusterName, id, addr)
return
}
m.missingMp.SetWithLabelValues(1, clusterName, id, addr)
if _, ok := m.mpMissingReplicaInfo[id]; !ok {
m.mpMissingReplicaInfo[id] = addrSet{addrs: make(map[string]voidType)}
// m.mpMissingReplicaInfo[id] = make(addrSet)
}
m.mpMissingReplicaInfo[id].addrs[addr] = voidVal
}
// leader only
func (m *warningMetrics) CleanObsoleteMpMissing(clusterName string, mp *MetaPartition) {
m.mpMissingReplicaMutex.Lock()
defer m.mpMissingReplicaMutex.Unlock()
if clusterName != m.cluster.Name {
return
}
id := strconv.FormatUint(mp.PartitionID, 10)
missingRepAddrs, ok := m.mpMissingReplicaInfo[id]
if !ok {
return
}
for addr := range missingRepAddrs.addrs {
if _, err := mp.getMetaReplica(addr); err != nil {
log.LogDebugf("action[warningMetrics] delete obsolete Mp missing record: dpId(%v), addr(%v)", id, addr)
m.deleteMissingMp(missingRepAddrs, clusterName, id, addr)
}
}
}
// leader only
func (m *warningMetrics) WarnMpNoLeader(clusterName string, partitionID uint64, report bool) {
if clusterName != m.cluster.Name {
return
}
m.mpMutex.Lock()
defer m.mpMutex.Unlock()
t, ok := m.mpNoLeaderInfo[partitionID]
if !report {
if ok {
delete(m.mpNoLeaderInfo, partitionID)
m.mpNoLeader.DeleteLabelValues(clusterName, strconv.FormatUint(partitionID, 10))
}
return
}
now := time.Now().Unix()
if !ok {
m.mpNoLeaderInfo[partitionID] = now
return
}
if now-t > m.cluster.cfg.MpNoLeaderReportIntervalSec {
m.mpNoLeader.SetWithLabelValues(1, clusterName, strconv.FormatUint(partitionID, 10))
m.mpNoLeaderInfo[partitionID] = now
}
}
func (mm *monitorMetrics) start() {
mm.dataNodesTotal = exporter.NewGauge(MetricDataNodesTotalGB)
mm.dataNodesUsed = exporter.NewGauge(MetricDataNodesUsedGB)
mm.dataNodeIncreased = exporter.NewGauge(MetricDataNodesIncreasedGB)
mm.metaNodesTotal = exporter.NewGauge(MetricMetaNodesTotalGB)
mm.metaNodesUsed = exporter.NewGauge(MetricMetaNodesUsedGB)
mm.metaNodesIncreased = exporter.NewGauge(MetricMetaNodesIncreasedGB)
mm.dataNodesCount = exporter.NewGauge(MetricDataNodesCount)
mm.metaNodesCount = exporter.NewGauge(MetricMetaNodesCount)
mm.lcNodesCount = exporter.NewGauge(MetricLcNodesCount)
mm.volCount = exporter.NewGauge(MetricVolCount)
mm.volTotalSpace = exporter.NewGaugeVec(MetricVolTotalGB, "", []string{"volName"})
mm.volUsedSpace = exporter.NewGaugeVec(MetricVolUsedGB, "", []string{"volName"})
mm.volUsage = exporter.NewGaugeVec(MetricVolUsageGB, "", []string{"volName"})
mm.volMetaCount = exporter.NewGaugeVec(MetricVolMetaCount, "", []string{"volName", "type"})
mm.badMpCount = exporter.NewGauge(MetricBadMpCount)
mm.badDpCount = exporter.NewGauge(MetricBadDpCount)
mm.diskError = exporter.NewGaugeVec(MetricDiskError, "", []string{"addr", "path"})
mm.nodeStat = exporter.NewGaugeVec(MetricNodeStat, "", []string{"type", "addr", "stat"})
mm.dataNodesInactive = exporter.NewGauge(MetricDataNodesInactive)
mm.InactiveDataNodeInfo = exporter.NewGaugeVec(MetricInactiveDataNodeInfo, "", []string{"clusterName", "addr"})
mm.metaNodesInactive = exporter.NewGauge(MetricMetaNodesInactive)
mm.dataNodesNotWritable = exporter.NewGauge(MetricDataNodesNotWritable)
mm.metaNodesNotWritable = exporter.NewGauge(MetricMetaNodesNotWritable)
mm.InactiveMataNodeInfo = exporter.NewGaugeVec(MetricInactiveMataNodeInfo, "", []string{"clusterName", "addr"})
mm.dataPartitionCount = exporter.NewGauge(MetricDataPartitionCount)
mm.ReplicaMissingDPCount = exporter.NewGauge(MetricReplicaMissingDPCount)
mm.DpMissingLeaderCount = exporter.NewGauge(MetricDpMissingLeaderCount)
mm.MpMissingLeaderCount = exporter.NewGauge(MetricMpMissingLeaderCount)
mm.dataNodesetInactiveCount = exporter.NewGaugeVec(MetricDataNodesetInactiveCount, "", []string{"nodeset"})
mm.metaNodesetInactiveCount = exporter.NewGaugeVec(MetricMetaNodesetInactiveCount, "", []string{"nodeset"})
mm.metaEqualCheckFail = exporter.NewGaugeVec(MetricMetaInconsistent, "", []string{"volume", "mpId"})
mm.masterSnapshot = exporter.NewGauge(MetricMasterSnapshot)
mm.masterNoLeader = exporter.NewGauge(MetricMasterNoLeader)
mm.masterNoCache = exporter.NewGaugeVec(MetricMasterNoCache, "", []string{"volName"})
mm.nodesetMetaTotal = exporter.NewGaugeVec(MetricNodesetMetaTotalGB, "", []string{"nodeset"})
mm.nodesetMetaUsed = exporter.NewGaugeVec(MetricNodesetMetaUsedGB, "", []string{"nodeset"})
mm.nodesetMetaUsageRatio = exporter.NewGaugeVec(MetricNodesetMetaUsageRadio, "", []string{"nodeset"})
mm.nodesetDataTotal = exporter.NewGaugeVec(MetricNodesetDataTotalGB, "", []string{"nodeset"})
mm.nodesetDataUsed = exporter.NewGaugeVec(MetricNodesetDataUsedGB, "", []string{"nodeset"})
mm.nodesetDataUsageRatio = exporter.NewGaugeVec(MetricNodesetDataUsageRadio, "", []string{"nodeset"})
mm.nodesetMpReplicaCount = exporter.NewGaugeVec(MetricNodesetMpReplicaCount, "", []string{"nodeset"})
mm.nodesetDpReplicaCount = exporter.NewGaugeVec(MetricNodesetDpReplicaCount, "", []string{"nodeset"})
mm.lcNodesCount = exporter.NewGauge(MetricLcNodesCount)
mm.lcTotalScanned = exporter.NewGaugeVec(MetricLcTotalScanned, "", []string{"volName", "type"})
mm.lcTotalFileScanned = exporter.NewGaugeVec(MetricLcTotalFileScanned, "", []string{"volName", "type"})
mm.lcTotalDirScanned = exporter.NewGaugeVec(MetricLcTotalDirScanned, "", []string{"volName", "type"})
mm.lcTotalExpired = exporter.NewGaugeVec(MetricLcTotalExpired, "", []string{"volName", "type"})
go mm.statMetrics()
}
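// Note: the deferred recover below stops the ticker and lets statMetrics
// return after a panic, so a panic inside doStat halts metric collection for
// this process instead of crashing the master.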
func (mm *monitorMetrics) statMetrics() {
ticker := time.NewTicker(StatPeriod)
defer func() {
if err := recover(); err != nil {
ticker.Stop()
log.LogErrorf("statMetrics panic,msg:%v", err)
}
}()
for {
select {
case <-ticker.C:
partition := mm.cluster.partition
if partition != nil && partition.IsRaftLeader() {
mm.resetFollowerMetrics()
mm.doStat()
} else {
mm.resetAllLeaderMetrics()
mm.doFollowerStat()
}
}
}
}
func (mm *monitorMetrics) doFollowerStat() {
if mm.cluster.leaderInfo.addr == "" {
mm.masterNoLeader.Set(1)
} else {
mm.masterNoLeader.Set(0)
}
if mm.cluster.fsm.onSnapshot {
mm.masterSnapshot.Set(1)
} else {
mm.masterSnapshot.Set(0)
}
mm.setVolNoCacheMetrics()
}
func (mm *monitorMetrics) doStat() {
dataNodeCount := mm.cluster.dataNodeCount()
mm.dataNodesCount.Set(float64(dataNodeCount))
metaNodeCount := mm.cluster.metaNodeCount()
mm.metaNodesCount.Set(float64(metaNodeCount))
lcNodeCount := mm.cluster.lcNodeCount()
mm.lcNodesCount.Set(float64(lcNodeCount))
volCount := len(mm.cluster.vols)
mm.volCount.Set(float64(volCount))
mm.dataNodesTotal.Set(float64(mm.cluster.dataNodeStatInfo.TotalGB))
mm.dataNodesUsed.Set(float64(mm.cluster.dataNodeStatInfo.UsedGB))
mm.dataNodeIncreased.Set(float64(mm.cluster.dataNodeStatInfo.IncreasedGB))
mm.metaNodesTotal.Set(float64(mm.cluster.metaNodeStatInfo.TotalGB))
mm.metaNodesUsed.Set(float64(mm.cluster.metaNodeStatInfo.UsedGB))
mm.metaNodesIncreased.Set(float64(mm.cluster.metaNodeStatInfo.IncreasedGB))
mm.setVolMetrics()
mm.setBadPartitionMetrics()
mm.setDiskErrorMetric()
mm.setNotWritableDataNodesCount()
mm.setNotWritableMetaNodesCount()
mm.setMpInconsistentErrorMetric()
mm.setMpAndDpMetrics()
mm.setNodesetMetrics()
mm.setLcMetrics()
mm.updateDataNodesStat()
mm.updateMetaNodesStat()
}
func (mm *monitorMetrics) setMpAndDpMetrics() {
dpCount := 0
dpMissingReplicaDpCount := 0
dpMissingLeaderCount := 0
mpMissingLeaderCount := 0
vols := mm.cluster.copyVols()
for _, vol := range vols {
if vol.Status == proto.VolStatusMarkDelete {
continue
}
dps := vol.dataPartitions
dpCount += len(dps.partitions)
for _, dp := range dps.partitions {
if dp.ReplicaNum > uint8(len(dp.liveReplicas(defaultDataPartitionTimeOutSec))) {
dpMissingReplicaDpCount++
}
if proto.IsNormalDp(dp.PartitionType) && dp.getLeaderAddr() == "" {
dpMissingLeaderCount++
}
}
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
if !mp.isLeaderExist() {
mpMissingLeaderCount++
}
}
vol.mpsLock.RUnlock()
}
mm.dataPartitionCount.Set(float64(dpCount))
mm.ReplicaMissingDPCount.Set(float64(dpMissingReplicaDpCount))
mm.DpMissingLeaderCount.Set(float64(dpMissingLeaderCount))
mm.MpMissingLeaderCount.Set(float64(mpMissingLeaderCount))
return
}
func (mm *monitorMetrics) setVolNoCacheMetrics() {
deleteVolNames := make(map[string]struct{})
obsoleteVolNames := make(map[string]struct{})
mm.cluster.followerReadManager.rwMutex.RLock()
for volName, stat := range mm.cluster.followerReadManager.status {
if mm.cluster.followerReadManager.isVolRecordObsolete(volName) {
deleteVolNames[volName] = struct{}{}
obsoleteVolNames[volName] = struct{}{}
log.LogDebugf("setVolNoCacheMetrics: add volName %v to deleteVolNames because the vol record became obsolete", volName)
continue
}
if stat {
deleteVolNames[volName] = struct{}{}
log.LogDebugf("setVolNoCacheMetrics: add volName %v to deleteVolNames because its status became ok", volName)
continue
}
log.LogWarnf("setVolNoCacheMetrics volName %v", volName)
mm.masterNoCache.SetWithLabelValues(1, volName)
}
mm.cluster.followerReadManager.rwMutex.RUnlock()
for volName := range deleteVolNames {
mm.masterNoCache.DeleteLabelValues(volName)
}
mm.cluster.followerReadManager.DelObsoleteVolRecord(obsoleteVolNames)
}
func (mm *monitorMetrics) setVolMetrics() {
deleteVolNames := make(map[string]struct{})
for k, v := range mm.volNames {
deleteVolNames[k] = v
delete(mm.volNames, k)
}
mm.cluster.volStatInfo.Range(func(key, value interface{}) bool {
volStatInfo, ok := value.(*volStatInfo)
if !ok {
return true
}
volName, ok := key.(string)
if !ok {
return true
}
mm.volNames[volName] = struct{}{}
if _, ok := deleteVolNames[volName]; ok {
delete(deleteVolNames, volName)
}
mm.volTotalSpace.SetWithLabelValues(float64(volStatInfo.TotalSize)/float64(util.GB), volName)
mm.volUsedSpace.SetWithLabelValues(float64(volStatInfo.UsedSize)/float64(util.GB), volName)
usedRatio, e := strconv.ParseFloat(volStatInfo.UsedRatio, 64)
if e == nil {
mm.volUsage.SetWithLabelValues(usedRatio, volName)
}
if usedRatio > volWarnUsedRatio {
WarnBySpecialKey("vol size used too high", fmt.Sprintf("vol: %v(total: %v, used: %v) has used(%v) to be full", volName, volStatInfo.TotalSize, volStatInfo.UsedRatio, volStatInfo.UsedSize))
}
return true
})
for volName, vol := range mm.cluster.allVols() {
inodeCount := uint64(0)
dentryCount := uint64(0)
mpCount := uint64(0)
freeListLen := uint64(0)
for _, mpv := range vol.getMetaPartitionsView() {
inodeCount += mpv.InodeCount
dentryCount += mpv.DentryCount
mpCount += 1
freeListLen += mpv.FreeListLen
}
mm.volMetaCount.SetWithLabelValues(float64(inodeCount), volName, "inode")
mm.volMetaCount.SetWithLabelValues(float64(dentryCount), volName, "dentry")
mm.volMetaCount.SetWithLabelValues(float64(mpCount), volName, "mp")
mm.volMetaCount.SetWithLabelValues(float64(vol.getDataPartitionsCount()), volName, "dp")
mm.volMetaCount.SetWithLabelValues(float64(freeListLen), volName, "freeList")
}
for volName := range deleteVolNames {
mm.deleteVolMetric(volName)
}
}
func (mm *monitorMetrics) setBadPartitionMetrics() {
badMpCount := uint64(0)
mm.cluster.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
badMpCount += uint64(len(value.([]uint64)))
return true
})
mm.badMpCount.SetWithLabels(float64(badMpCount), map[string]string{"type": "bad_mp"})
badDpCount := uint64(0)
mm.cluster.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDpCount += uint64(len(value.([]uint64)))
return true
})
mm.badDpCount.SetWithLabels(float64(badDpCount), map[string]string{"type": "bad_dp"})
}
func (mm *monitorMetrics) deleteVolMetric(volName string) {
mm.volTotalSpace.DeleteLabelValues(volName)
mm.volUsedSpace.DeleteLabelValues(volName)
mm.volUsage.DeleteLabelValues(volName)
mm.volMetaCount.DeleteLabelValues(volName, "inode")
mm.volMetaCount.DeleteLabelValues(volName, "dentry")
mm.volMetaCount.DeleteLabelValues(volName, "mp")
mm.volMetaCount.DeleteLabelValues(volName, "dp")
mm.volMetaCount.DeleteLabelValues(volName, "freeList")
}
func (mm *monitorMetrics) setMpInconsistentErrorMetric() {
deleteMps := make(map[string]string)
for k, v := range mm.inconsistentMps {
deleteMps[k] = v
delete(mm.inconsistentMps, k)
}
mm.cluster.volMutex.RLock()
defer mm.cluster.volMutex.RUnlock()
for _, vol := range mm.cluster.vols {
if vol.Status == proto.VolStatusMarkDelete {
continue
}
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
if mp.IsRecover || mp.EqualCheckPass {
continue
}
idStr := strconv.FormatUint(mp.PartitionID, 10)
mm.metaEqualCheckFail.SetWithLabelValues(1, vol.Name, idStr)
mm.inconsistentMps[idStr] = vol.Name
log.LogWarnf("setMpInconsistentErrorMetric.mp %v SetWithLabelValues id %v vol %v", mp.PartitionID, idStr, vol.Name)
delete(deleteMps, idStr)
}
vol.mpsLock.RUnlock()
}
for k, v := range deleteMps {
mm.metaEqualCheckFail.DeleteLabelValues(v, k)
}
}
func (mm *monitorMetrics) setDiskErrorMetric() {
// key: addr_diskpath, val: addr
deleteBadDisks := make(map[string]string)
for k, v := range mm.badDisks {
deleteBadDisks[k] = v
delete(mm.badDisks, k)
}
mm.cluster.dataNodes.Range(func(addr, node interface{}) bool {
dataNode, ok := node.(*DataNode)
if !ok {
return true
}
for _, badDisk := range dataNode.BadDisks {
for _, partition := range dataNode.DataPartitionReports {
if partition.DiskPath == badDisk {
key := fmt.Sprintf("%s_%s", dataNode.Addr, badDisk)
mm.diskError.SetWithLabelValues(1, dataNode.Addr, key)
mm.badDisks[key] = dataNode.Addr
delete(deleteBadDisks, key)
break
}
}
}
return true
})
for k, v := range deleteBadDisks {
mm.diskError.DeleteLabelValues(v, k)
}
}
func (mm *monitorMetrics) updateMetaNodesStat() {
var inactiveMetaNodesCount int64
deleteNodesetCount := make(map[uint64]int64)
for k, v := range mm.nodesetInactiveMetaNodesCount {
deleteNodesetCount[k] = v
delete(mm.nodesetInactiveMetaNodesCount, k)
}
mm.cluster.metaNodes.Range(func(addr, node interface{}) bool {
metaNode, ok := node.(*MetaNode)
if !ok {
return true
}
if !metaNode.IsActive {
inactiveMetaNodesCount++
mm.InactiveMataNodeInfo.SetWithLabelValues(1, mm.cluster.Name, metaNode.Addr)
mm.nodesetInactiveMetaNodesCount[metaNode.NodeSetID] = mm.nodesetInactiveMetaNodesCount[metaNode.NodeSetID] + 1
delete(deleteNodesetCount, metaNode.NodeSetID)
} else {
mm.InactiveMataNodeInfo.DeleteLabelValues(mm.cluster.Name, metaNode.Addr)
}
mm.nodeStat.SetWithLabelValues(metaNode.Ratio, MetricRoleMetaNode, metaNode.Addr, "usageRatio")
mm.nodeStat.SetWithLabelValues(float64(metaNode.Total), MetricRoleMetaNode, metaNode.Addr, "memTotal")
mm.nodeStat.SetWithLabelValues(float64(metaNode.Used), MetricRoleMetaNode, metaNode.Addr, "memUsed")
mm.nodeStat.SetWithLabelValues(float64(metaNode.MetaPartitionCount), MetricRoleMetaNode, metaNode.Addr, "mpCount")
mm.nodeStat.SetWithLabelValues(float64(metaNode.Threshold), MetricRoleMetaNode, metaNode.Addr, "threshold")
mm.nodeStat.SetBoolWithLabelValues(metaNode.isWritable(), MetricRoleMetaNode, metaNode.Addr, "writable")
mm.nodeStat.SetBoolWithLabelValues(metaNode.IsActive, MetricRoleMetaNode, metaNode.Addr, "active")
return true
})
mm.metaNodesInactive.Set(float64(inactiveMetaNodesCount))
for id, count := range mm.nodesetInactiveMetaNodesCount {
mm.metaNodesetInactiveCount.SetWithLabelValues(float64(count), strconv.FormatUint(id, 10))
}
for k := range deleteNodesetCount {
mm.metaNodesetInactiveCount.DeleteLabelValues(strconv.FormatUint(k, 10))
}
}
func (mm *monitorMetrics) clearInactiveMetaNodesCountMetric() {
for k := range mm.nodesetInactiveMetaNodesCount {
mm.metaNodesetInactiveCount.DeleteLabelValues(strconv.FormatUint(k, 10))
}
}
func (mm *monitorMetrics) updateDataNodesStat() {
var inactiveDataNodesCount uint64
deleteNodesetCount := make(map[uint64]int64)
for k, v := range mm.nodesetInactiveDataNodesCount {
log.LogErrorf("setInactiveDataNodesCountMetric, init deleteNodesetCount")
deleteNodesetCount[k] = v
delete(mm.nodesetInactiveDataNodesCount, k)
}
mm.cluster.dataNodes.Range(func(addr, node interface{}) bool {
dataNode, ok := node.(*DataNode)
if !ok {
return true
}
if !dataNode.isActive {
inactiveDataNodesCount++
mm.InactiveDataNodeInfo.SetWithLabelValues(1, mm.cluster.Name, dataNode.Addr)
mm.nodesetInactiveDataNodesCount[dataNode.NodeSetID] = mm.nodesetInactiveDataNodesCount[dataNode.NodeSetID] + 1
delete(deleteNodesetCount, dataNode.NodeSetID)
} else {
mm.InactiveDataNodeInfo.DeleteLabelValues(mm.cluster.Name, dataNode.Addr)
}
mm.nodeStat.SetWithLabelValues(float64(dataNode.DataPartitionCount), MetricRoleDataNode, dataNode.Addr, "dpCount")
mm.nodeStat.SetWithLabelValues(float64(dataNode.Total), MetricRoleDataNode, dataNode.Addr, "diskTotal")
mm.nodeStat.SetWithLabelValues(float64(dataNode.Used), MetricRoleDataNode, dataNode.Addr, "diskUsed")
mm.nodeStat.SetWithLabelValues(float64(dataNode.AvailableSpace), MetricRoleDataNode, dataNode.Addr, "diskAvail")
mm.nodeStat.SetWithLabelValues(dataNode.UsageRatio, MetricRoleDataNode, dataNode.Addr, "usageRatio")
mm.nodeStat.SetWithLabelValues(float64(len(dataNode.BadDisks)), MetricRoleDataNode, dataNode.Addr, "badDiskCount")
mm.nodeStat.SetBoolWithLabelValues(dataNode.isActive, MetricRoleDataNode, dataNode.Addr, "active")
mm.nodeStat.SetBoolWithLabelValues(dataNode.isWriteAble(), MetricRoleDataNode, dataNode.Addr, "writable")
return true
})
mm.dataNodesInactive.Set(float64(inactiveDataNodesCount))
for id, count := range mm.nodesetInactiveDataNodesCount {
mm.dataNodesetInactiveCount.SetWithLabelValues(float64(count), strconv.FormatUint(id, 10))
}
for k := range deleteNodesetCount {
mm.dataNodesetInactiveCount.DeleteLabelValues(strconv.FormatUint(k, 10))
}
}
func (mm *monitorMetrics) clearInactiveDataNodesCountMetric() {
for k := range mm.nodesetInactiveDataNodesCount {
mm.dataNodesetInactiveCount.DeleteLabelValues(strconv.FormatUint(k, 10))
}
}
func (mm *monitorMetrics) setNotWritableMetaNodesCount() {
var notWritableMetaNodesCount int64
mm.cluster.metaNodes.Range(func(addr, node interface{}) bool {
metaNode, ok := node.(*MetaNode)
if !ok {
return true
}
if !metaNode.isWritable() {
notWritableMetaNodesCount++
}
return true
})
mm.metaNodesNotWritable.Set(float64(notWritableMetaNodesCount))
}
func (mm *monitorMetrics) setNotWritableDataNodesCount() {
var notWritableDataNodesCount int64
mm.cluster.dataNodes.Range(func(addr, node interface{}) bool {
dataNode, ok := node.(*DataNode)
if !ok {
return true
}
if !dataNode.isWriteAble() {
notWritableDataNodesCount++
}
return true
})
mm.dataNodesNotWritable.Set(float64(notWritableDataNodesCount))
}
func (mm *monitorMetrics) clearInconsistentMps() {
for k := range mm.inconsistentMps {
mm.dataNodesetInactiveCount.DeleteLabelValues(k)
}
}
func (mm *monitorMetrics) deleteS3LcVolMetric(volName string) {
mm.lcTotalScanned.DeleteLabelValues(volName, "total")
mm.lcTotalFileScanned.DeleteLabelValues(volName, "file")
mm.lcTotalDirScanned.DeleteLabelValues(volName, "dir")
mm.lcTotalExpired.DeleteLabelValues(volName, "expired")
}
func (mm *monitorMetrics) setLcMetrics() {
lcTaskStatus := mm.cluster.lcMgr.lcRuleTaskStatus
volumeScanStatistics := make(map[string]proto.LcNodeRuleTaskStatistics, 0)
lcTaskStatus.RLock()
for _, r := range lcTaskStatus.Results {
key := r.Volume + "[" + r.RuleId + "]"
if _, ok := volumeScanStatistics[key]; ok && r.Done {
volumeScanStatistics[key] = proto.LcNodeRuleTaskStatistics{}
} else {
volumeScanStatistics[key] = r.LcNodeRuleTaskStatistics
}
}
lcTaskStatus.RUnlock()
for key, stat := range volumeScanStatistics {
mm.lcVolNames[key] = struct{}{}
mm.lcTotalScanned.SetWithLabelValues(float64(stat.TotalInodeScannedNum), key, "total")
mm.lcTotalFileScanned.SetWithLabelValues(float64(stat.FileScannedNum), key, "file")
mm.lcTotalDirScanned.SetWithLabelValues(float64(stat.DirScannedNum), key, "dir")
mm.lcTotalExpired.SetWithLabelValues(float64(stat.ExpiredNum), key, "expired")
}
}
func (mm *monitorMetrics) clearLcMetrics() {
for vol := range mm.lcVolNames {
mm.deleteS3LcVolMetric(vol)
delete(mm.lcVolNames, vol)
}
}
func (mm *monitorMetrics) clearVolMetrics() {
mm.cluster.volStatInfo.Range(func(key, value interface{}) bool {
if volName, ok := key.(string); ok {
mm.deleteVolMetric(volName)
}
return true
})
}
func (mm *monitorMetrics) clearDiskErrMetrics() {
for k, v := range mm.badDisks {
mm.diskError.DeleteLabelValues(v, k)
}
}
func (mm *monitorMetrics) setNodesetMetrics() {
deleteNodesetIds := make(map[uint64]string)
for k, v := range mm.nodesetIds {
deleteNodesetIds[k] = v
}
mm.nodesetIds = make(map[uint64]string)
zones := mm.cluster.t.getAllZones()
for _, zone := range zones {
nodeSets := zone.getAllNodeSet()
for _, nodeset := range nodeSets {
var metaTotal, metaUsed, dataTotal, dataUsed uint64
var mpReplicasCount, dpReplicasCount int
nodeset.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
metaTotal += metaNode.Total
metaUsed += metaNode.Used
mpReplicasCount += metaNode.MetaPartitionCount
return true
})
nodeset.dataNodes.Range(func(ney, value interface{}) bool {
dataNode := value.(*DataNode)
dataTotal += dataNode.Total
dataUsed += dataNode.Used
dpReplicasCount += int(dataNode.DataPartitionCount)
return true
})
nodesetId := strconv.FormatUint(nodeset.ID, 10)
mm.nodesetIds[nodeset.ID] = nodesetId
delete(deleteNodesetIds, nodeset.ID)
mm.nodesetMetaTotal.SetWithLabelValues(float64(metaTotal)/util.GB, nodesetId)
mm.nodesetMetaUsed.SetWithLabelValues(float64(metaUsed)/util.GB, nodesetId)
mm.nodesetDataTotal.SetWithLabelValues(float64(dataTotal)/util.GB, nodesetId)
mm.nodesetDataUsed.SetWithLabelValues(float64(dataUsed)/util.GB, nodesetId)
if metaTotal == 0 {
mm.nodesetMetaUsageRatio.SetWithLabelValues(0, nodesetId)
} else {
mm.nodesetMetaUsageRatio.SetWithLabelValues(float64(metaUsed)/float64(metaTotal), nodesetId)
}
if dataTotal == 0 {
mm.nodesetDataUsageRatio.SetWithLabelValues(0, nodesetId)
} else {
mm.nodesetDataUsageRatio.SetWithLabelValues(float64(dataUsed)/float64(dataTotal), nodesetId)
}
mm.nodesetMpReplicaCount.SetWithLabelValues(float64(mpReplicasCount), nodesetId)
mm.nodesetDpReplicaCount.SetWithLabelValues(float64(dpReplicasCount), nodesetId)
}
}
for _, v := range deleteNodesetIds {
mm.deleteNodesetMetric(v)
}
}
func (mm *monitorMetrics) deleteNodesetMetric(nodesetId string) {
mm.nodesetMetaTotal.DeleteLabelValues(nodesetId)
mm.nodesetMetaUsed.DeleteLabelValues(nodesetId)
mm.nodesetMetaUsageRatio.DeleteLabelValues(nodesetId)
mm.nodesetDataTotal.DeleteLabelValues(nodesetId)
mm.nodesetDataUsed.DeleteLabelValues(nodesetId)
mm.nodesetDataUsageRatio.DeleteLabelValues(nodesetId)
mm.nodesetMpReplicaCount.DeleteLabelValues(nodesetId)
mm.nodesetDpReplicaCount.DeleteLabelValues(nodesetId)
}
func (mm *monitorMetrics) clearNodesetMetrics() {
zones := mm.cluster.t.getAllZones()
for _, zone := range zones {
nodeSets := zone.getAllNodeSet()
for _, nodeset := range nodeSets {
mm.deleteNodesetMetric(strconv.FormatUint(nodeset.ID, 10))
}
}
}
func (mm *monitorMetrics) resetFollowerMetrics() {
mm.masterNoCache.GaugeVec.Reset()
mm.masterNoLeader.Set(0)
mm.masterSnapshot.Set(0)
}
func (mm *monitorMetrics) resetAllLeaderMetrics() {
mm.clearVolMetrics()
mm.clearDiskErrMetrics()
mm.clearInactiveMetaNodesCountMetric()
mm.clearInactiveDataNodesCountMetric()
mm.clearInconsistentMps()
mm.clearNodesetMetrics()
mm.clearLcMetrics()
mm.dataNodesCount.Set(0)
mm.metaNodesCount.Set(0)
mm.lcNodesCount.Set(0)
mm.volCount.Set(0)
mm.dataNodesTotal.Set(0)
mm.dataNodesUsed.Set(0)
mm.dataNodeIncreased.Set(0)
mm.metaNodesTotal.Set(0)
mm.metaNodesUsed.Set(0)
mm.metaNodesIncreased.Set(0)
// mm.diskError.Set(0)
mm.dataNodesInactive.Set(0)
mm.metaNodesInactive.Set(0)
mm.dataNodesNotWritable.Set(0)
mm.metaNodesNotWritable.Set(0)
mm.dataPartitionCount.Set(0)
mm.ReplicaMissingDPCount.Set(0)
mm.MpMissingLeaderCount.Set(0)
mm.DpMissingLeaderCount.Set(0)
}
package master
import (
"encoding/json"
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type Ver2PhaseCommit struct {
op uint8
prepareInfo *proto.VolVersionInfo
commitCnt uint32
nodeCnt uint32
dataNodeArray *sync.Map
metaNodeArray *sync.Map
}
func (commit *Ver2PhaseCommit) String() string {
return fmt.Sprintf("prepareCommit:(op[%v] commitCnt[%v],nodeCnt[%v] info[%v])",
commit.op, commit.commitCnt, commit.nodeCnt, commit.prepareInfo)
}
func (commit *Ver2PhaseCommit) reset(volName string) {
commit.op = 0
commit.commitCnt = 0
commit.nodeCnt = 0
// datanodes and metanodes do not allow membership changes while a snapshot is in progress
commit.dataNodeArray = new(sync.Map)
commit.metaNodeArray = new(sync.Map)
log.LogDebugf("action[Ver2PhaseCommit.reset] vol name %v", volName)
}
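// The Ver2PhaseCommit fields drive a simple two-phase commit across data and
// meta nodes: createTask fans a prepare task out to every involved node and
// increments nodeCnt per target, handleTaskRsp increments commitCnt per
// reply, and the commit step only runs once commitCnt == nodeCnt. A minimal
// sketch of that bookkeeping (a standalone illustration with made-up
// addresses, not the master's real task plumbing):
//
//   var commit Ver2PhaseCommit
//   commit.reset("volExample")
//   for _, addr := range []string{"n1:17310", "n2:17310", "n3:17310"} {
//       commit.dataNodeArray.Store(addr, TypeNoReply)
//       commit.nodeCnt++
//   }
//   // called once per node response; returns true when it is time to commit
//   onReply := func(addr string) bool {
//       commit.dataNodeArray.Store(addr, TypeReply)
//       return atomic.AddUint32(&commit.commitCnt, 1) == commit.nodeCnt
//   }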
type VolVersionPersist struct {
MultiVersionList []*proto.VolVersionInfo
Strategy proto.VolumeVerStrategy
VerSeq uint64
}
type VolVersionManager struct {
// all snapshots, excluding deleted ones; deleted ones should be recorded in the error log
multiVersionList []*proto.VolVersionInfo
vol *Vol
prepareCommit *Ver2PhaseCommit
status uint32
wait chan error
cancel chan bool
verSeq uint64
enabled bool
strategy proto.VolumeVerStrategy
checkStrategy int32
checkStatus int32
c *Cluster
enableMiddleOp bool
sync.RWMutex
}
func newVersionMgr(vol *Vol) (mgr *VolVersionManager) {
mgr = &VolVersionManager{
vol: vol,
wait: make(chan error, 1),
cancel: make(chan bool, 1),
prepareCommit: &Ver2PhaseCommit{
dataNodeArray: new(sync.Map),
metaNodeArray: new(sync.Map),
},
}
return
}
func (verMgr *VolVersionManager) String() string {
return fmt.Sprintf("mgr:{vol[%v],status[%v] verSeq [%v], prepareinfo [%v], verlist [%v]}",
verMgr.vol.Name, verMgr.status, verMgr.verSeq, verMgr.prepareCommit, verMgr.multiVersionList)
}
func (verMgr *VolVersionManager) Persist() (err error) {
persistInfo := &VolVersionPersist{
MultiVersionList: verMgr.multiVersionList,
Strategy: verMgr.strategy,
VerSeq: verMgr.verSeq,
}
var val []byte
if val, err = json.Marshal(persistInfo); err != nil {
return
}
if verMgr.c == nil {
log.LogErrorf("vol %v cluster nil", verMgr.vol.Name)
return fmt.Errorf("persist vol %v cluster nil", verMgr.vol.Name)
}
if err = verMgr.c.syncMultiVersion(verMgr.vol, val); err != nil {
return
}
return
}
func (verMgr *VolVersionManager) loadMultiVersion(c *Cluster, val []byte) (err error) {
persistInfo := &VolVersionPersist{}
if err = json.Unmarshal(val, persistInfo); err != nil {
return
}
verMgr.multiVersionList = persistInfo.MultiVersionList
verMgr.verSeq = persistInfo.VerSeq
verMgr.strategy = persistInfo.Strategy
return nil
}
func (verMgr *VolVersionManager) CommitVer() (ver *proto.VolVersionInfo) {
log.LogDebugf("action[CommitVer] op %v vol %v %v", verMgr.prepareCommit.op, verMgr.vol.Name, verMgr)
if verMgr.prepareCommit.op == proto.CreateVersionPrepare {
ver = verMgr.prepareCommit.prepareInfo
commitVer := &proto.VolVersionInfo{
Ver: ver.Ver,
Status: proto.VersionNormal,
}
verMgr.multiVersionList = append(verMgr.multiVersionList, commitVer)
verMgr.verSeq = ver.Ver
log.LogInfof("action[CommitVer] vol %v verseq %v exit", verMgr.vol.Name, verMgr.verSeq)
if err := verMgr.Persist(); err != nil {
log.LogErrorf("action[createVer2PhaseTask] vol %v err %v", verMgr.vol.Name, err)
return
}
log.LogDebugf("action[CommitVer] vol %v ask mgr do commit in next step version %v", verMgr.vol.Name, ver)
verMgr.wait <- nil
} else if verMgr.prepareCommit.op == proto.DeleteVersion {
idx, found := verMgr.getLayInfo(verMgr.prepareCommit.prepareInfo.Ver)
if !found {
log.LogErrorf("action[CommitVer] vol %v not found seq %v in list but commit", verMgr.vol.Name, verMgr.prepareCommit.prepareInfo.Ver)
return
}
verMgr.multiVersionList[idx].Status = proto.VersionDeleting
verMgr.multiVersionList[idx].DelTime = time.Now().Unix()
verMgr.wait <- nil
} else {
log.LogErrorf("action[CommitVer] vol %v with seq %v wrong step", verMgr.vol.Name, verMgr.prepareCommit.prepareInfo.Ver)
}
return
}
func (verMgr *VolVersionManager) GenerateVer(verSeq uint64, op uint8) (err error) {
log.LogInfof("action[GenerateVer] vol %v enter verseq %v", verMgr.vol.Name, verSeq)
verMgr.Lock()
defer verMgr.Unlock()
tm := time.Now()
verMgr.enabled = true
if len(verMgr.multiVersionList) > MaxSnapshotCount {
err = fmt.Errorf("too much version exceed %v in list", MaxSnapshotCount)
log.LogWarnf("action[GenerateVer] vol %v err %v", verMgr.vol.Name, err)
return
}
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.prepareCommit.prepareInfo = &proto.VolVersionInfo{
Ver: verSeq,
Status: proto.VersionNormal,
}
verMgr.prepareCommit.op = op
size := len(verMgr.multiVersionList)
if size > 0 && !tm.After(time.Unix(int64(verMgr.multiVersionList[size-1].Ver)/1e6, 0)) {
verMgr.prepareCommit.prepareInfo.Ver = uint64(verMgr.multiVersionList[size-1].Ver) + 1
log.LogDebugf("action[GenerateVer] vol %v use ver %v", verMgr.vol.Name, verMgr.prepareCommit.prepareInfo.Ver)
}
log.LogDebugf("action[GenerateVer] vol %v exit", verMgr.vol.Name)
return
}
func (verMgr *VolVersionManager) DelVer(verSeq uint64) (err error) {
verMgr.Lock()
defer verMgr.Unlock()
for i, ver := range verMgr.multiVersionList {
if ver.Ver == verSeq {
if ver.Status != proto.VersionDeleting && ver.Status != proto.VersionDeleteAbnormal {
err = fmt.Errorf("with seq %v but it's status is %v", verSeq, ver.Status)
log.LogErrorf("action[VolVersionManager.DelVer] vol %v err %v", verMgr.vol.Name, err)
return
}
verMgr.multiVersionList = append(verMgr.multiVersionList[:i], verMgr.multiVersionList[i+1:]...)
break
}
}
if err = verMgr.Persist(); err != nil {
log.LogErrorf("[DelVer] vol %v call persist error %v", verMgr.vol.Name, err)
}
return
}
func (verMgr *VolVersionManager) SetVerStrategy(strategy proto.VolumeVerStrategy, isForce bool) (err error) {
verMgr.Lock()
defer verMgr.Unlock()
log.LogWarnf("vol %v SetVerStrategy.keepCnt %v need in [1-%v], peroidic %v need in [1-%v], enable %v", verMgr.vol.Name,
strategy.KeepVerCnt, MaxSnapshotCount, strategy.GetPeriodic(), 24*7, strategy.Enable)
if strategy.Enable == true {
if strategy.KeepVerCnt > MaxSnapshotCount || strategy.GetPeriodic() > 24*7 || strategy.KeepVerCnt < 0 || strategy.GetPeriodic() < 0 {
return fmt.Errorf("SetVerStrategy.vol %v keepCnt %v need in [1-%v], peroidic %v need in [1-%v] not qualified",
verMgr.vol.Name, strategy.KeepVerCnt, MaxSnapshotCount, strategy.GetPeriodic(), 24*7)
}
if strategy.KeepVerCnt != 0 {
verMgr.strategy.KeepVerCnt = strategy.KeepVerCnt
}
if strategy.GetPeriodic() != 0 {
verMgr.strategy.Periodic = strategy.Periodic
}
if isForce {
verMgr.strategy.ForceUpdate = strategy.ForceUpdate
}
}
verMgr.strategy.Enable = strategy.Enable
verMgr.strategy.UTime = time.Now()
if err = verMgr.Persist(); err != nil {
log.LogErrorf("action[SetVerStrategy] vol %v err %v", verMgr.vol.Name, err)
return
}
return
}
func (verMgr *VolVersionManager) checkCreateStrategy(c *Cluster) {
verMgr.RLock()
log.LogDebugf("checkSnapshotStrategy enter")
if len(verMgr.multiVersionList)-1 > verMgr.strategy.KeepVerCnt {
verMgr.RUnlock()
return
}
verMgr.RUnlock()
curTime := time.Now()
if verMgr.strategy.TimeUp(curTime) {
log.LogDebugf("checkSnapshotStrategy.vol %v try create snapshot", verMgr.vol.Name)
if _, err := verMgr.createVer2PhaseTask(c, uint64(time.Now().UnixMicro()), proto.CreateVersion, verMgr.strategy.ForceUpdate); err != nil {
verMgr.RLock()
verEle := verMgr.multiVersionList[len(verMgr.multiVersionList)-1]
verMgr.RUnlock()
if int64(verEle.Ver)/1e6+int64(verMgr.strategy.GetPeriodicSecond()) < curTime.Unix() {
msg := fmt.Sprintf("[checkSnapshotStrategy] last version %v status %v for %v hours than 2times periodic", verEle.Ver, verEle.Status, 2*verMgr.strategy.Periodic)
Warn(c.Name, msg)
}
return
}
verMgr.strategy.UTime = time.Now()
if err := verMgr.Persist(); err != nil {
log.LogErrorf("vol %v call persist error %v", verMgr.vol.Name, err)
}
}
}
func (verMgr *VolVersionManager) checkDeleteStrategy(c *Cluster) {
verMgr.RLock()
log.LogDebugf("checkSnapshotStrategy.vol %v try delete snapshot nLen %v, keep cnt %v", verMgr.vol.Name, len(verMgr.multiVersionList)-1, verMgr.strategy.KeepVerCnt)
nLen := len(verMgr.multiVersionList)
log.LogDebugf("checkSnapshotStrategy.vol %v try delete snapshot nLen %v, keep cnt %v", verMgr.vol.Name, len(verMgr.multiVersionList)-1, verMgr.strategy.KeepVerCnt)
if nLen-1 > verMgr.strategy.KeepVerCnt {
log.LogDebugf("checkSnapshotStrategy.vol %v try delete snapshot nLen %v, keep cnt %v", verMgr.vol.Name, nLen-1, verMgr.strategy.KeepVerCnt)
if verMgr.multiVersionList[0].Status != proto.VersionNormal {
log.LogDebugf("checkSnapshotStrategy.vol %v oldest ver %v status %v",
verMgr.vol.Name, verMgr.multiVersionList[0].Ver, verMgr.multiVersionList[0].Status)
if verMgr.multiVersionList[0].DelTime+int64(verMgr.strategy.GetPeriodicSecond()) < time.Now().Unix() {
msg := fmt.Sprintf("[checkSnapshotStrategy] version %v in deleting status for %v hours than configure periodic [%v] hours",
verMgr.multiVersionList[0].Ver, verMgr.multiVersionList[0].Status, verMgr.strategy.GetPeriodic())
Warn(c.Name, msg)
}
verMgr.RUnlock()
return
}
verMgr.RUnlock()
if _, err := verMgr.createVer2PhaseTask(c, verMgr.multiVersionList[0].Ver, proto.DeleteVersion, verMgr.strategy.ForceUpdate); err != nil {
return
}
return
}
verMgr.RUnlock()
}
func (verMgr *VolVersionManager) UpdateVerStatus(verSeq uint64, status uint8) (err error) {
verMgr.Lock()
defer verMgr.Unlock()
for _, ver := range verMgr.multiVersionList {
if ver.Ver == verSeq {
ver.Status = status
}
if ver.Ver > verSeq {
return fmt.Errorf("not found")
}
}
return
}
const (
TypeNoReply = 0
TypeReply = 1
MaxSnapshotCount = 30
)
func (verMgr *VolVersionManager) handleTaskRsp(resp *proto.MultiVersionOpResponse, partitionType uint32) {
verMgr.RLock()
defer verMgr.RUnlock()
log.LogInfof("action[handleTaskRsp] vol %v node %v partitionType %v,op %v, inner op %v", verMgr.vol.Name,
resp.Addr, partitionType, resp.Op, verMgr.prepareCommit.op)
if resp.Op != verMgr.prepareCommit.op {
log.LogWarnf("action[handleTaskRsp] vol %v op %v, inner op %v", verMgr.vol.Name, resp.Op, verMgr.prepareCommit.op)
return
}
if resp.Op != proto.DeleteVersion && resp.VerSeq != verMgr.prepareCommit.prepareInfo.Ver {
log.LogErrorf("action[handleTaskRsp] vol %v op %v, inner verseq %v commit verseq %v", verMgr.vol.Name,
resp.Op, resp.VerSeq, verMgr.prepareCommit.prepareInfo.Ver)
return
}
var needCommit bool
dFunc := func(pType uint32, array *sync.Map) {
if val, ok := array.Load(resp.Addr); ok {
if rType, rok := val.(int); rok && rType == TypeNoReply {
log.LogInfof("action[handleTaskRsp] vol %v node %v partitionType %v,op %v, inner op %v", verMgr.vol.Name,
resp.Addr, partitionType, resp.Op, verMgr.prepareCommit.op)
array.Store(resp.Addr, TypeReply)
if resp.Status != proto.TaskSucceeds || resp.Result != "" {
log.LogErrorf("action[handleTaskRsp] vol %v type %v node %v rsp sucess. op %v, verseq %v,commit cnt %v, rsp status %v mgr status %v result %v",
verMgr.vol.Name, pType, resp.Addr, resp.Op, resp.VerSeq, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt), resp.Status, verMgr.status, resp.Result)
if verMgr.prepareCommit.prepareInfo.Status == proto.VersionWorking {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
verMgr.wait <- fmt.Errorf("pType %v node %v error %v", pType, resp.Addr, resp.Status)
log.LogErrorf("action[handleTaskRsp] vol %v type %v commit cnt %v, rsp status %v mgr status %v result %v", verMgr.vol.Name,
pType, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt), resp.Status, verMgr.status, resp.Result)
return
}
return
}
if verMgr.prepareCommit.nodeCnt == atomic.AddUint32(&verMgr.prepareCommit.commitCnt, 1) {
needCommit = true
}
log.LogDebugf("action[handleTaskRsp] vol %v type %v node %v rsp sucess. op %v, verseq %v,commit cnt %v", verMgr.vol.Name,
pType, resp.Addr, resp.Op, resp.VerSeq, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt))
} else {
log.LogWarnf("action[handleTaskRsp] vol %v type %v node %v op %v, inner verseq %v commit verseq %v status %v", verMgr.vol.Name,
pType, resp.Addr, resp.Op, resp.VerSeq, verMgr.prepareCommit.prepareInfo.Ver, val.(int))
}
} else {
log.LogErrorf("action[handleTaskRsp] vol %v type %v node %v not found. op %v, inner verseq %v commit verseq %v", verMgr.vol.Name,
pType, resp.Addr, resp.Op, resp.VerSeq, verMgr.prepareCommit.prepareInfo.Ver)
}
}
if partitionType == TypeDataPartition {
dFunc(partitionType, verMgr.prepareCommit.dataNodeArray)
} else {
dFunc(partitionType, verMgr.prepareCommit.metaNodeArray)
}
log.LogInfof("action[handleTaskRsp] vol %v commit cnt %v, node cnt %v, operation %v", verMgr.vol.Name,
atomic.LoadUint32(&verMgr.prepareCommit.commitCnt),
atomic.LoadUint32(&verMgr.prepareCommit.nodeCnt), verMgr.prepareCommit.op)
if atomic.LoadUint32(&verMgr.prepareCommit.commitCnt) == verMgr.prepareCommit.nodeCnt && needCommit {
if verMgr.prepareCommit.op == proto.DeleteVersion {
verMgr.CommitVer()
// verMgr.prepareCommit.reset()
// verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingFinished
log.LogWarnf("action[handleTaskRsp] vol %v do Del version finished, verMgr %v", verMgr.vol.Name, verMgr)
} else if verMgr.prepareCommit.op == proto.CreateVersionPrepare {
log.LogInfof("action[handleTaskRsp] vol %v ver update prepare sucess. op %v, verseq %v,commit cnt %v", verMgr.vol.Name,
resp.Op, resp.VerSeq, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt))
verMgr.CommitVer()
} else if verMgr.prepareCommit.op == proto.CreateVersionCommit {
log.LogWarnf("action[handleTaskRsp] vol %v ver already update all node now! op %v, verseq %v,commit cnt %v", verMgr.vol.Name,
resp.Op, resp.VerSeq, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt))
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingFinished
verMgr.wait <- nil
}
}
}
func (verMgr *VolVersionManager) createTaskToDataNode(cluster *Cluster, verSeq uint64, op uint8, force bool) (err error) {
var dpHost sync.Map
log.LogWarnf("action[createTaskToDataNode] vol %v verMgr.status %v verSeq %v op %v force %v, prepareCommit.nodeCnt %v",
verMgr.vol.Name, verMgr.status, verSeq, op, force, verMgr.prepareCommit.nodeCnt)
for _, dp := range verMgr.vol.dataPartitions.clonePartitions() {
for _, host := range dp.Hosts {
dpHost.Store(host, nil)
}
dp.VerSeq = verSeq
}
tasks := make([]*proto.AdminTask, 0)
cluster.dataNodes.Range(func(addr, dataNode interface{}) bool {
if _, ok := dpHost.Load(addr); !ok {
return true
}
node := dataNode.(*DataNode)
node.checkLiveness()
if !node.isActive {
if !force {
err = fmt.Errorf("node %v not alive", node.Addr)
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
return false
}
atomic.AddUint32(&verMgr.prepareCommit.commitCnt, 1)
log.LogInfof("action[createTaskToDataNode] volume %v addr %v op %v verseq %v force commit in advance", verMgr.vol.Name, addr.(string), op, verSeq)
}
verMgr.prepareCommit.dataNodeArray.Store(node.Addr, TypeNoReply)
verMgr.prepareCommit.nodeCnt++
log.LogInfof("action[createTaskToDataNode] volume %v addr %v op %v verseq %v nodeCnt %v",
verMgr.vol.Name, addr.(string), op, verSeq, verMgr.prepareCommit.nodeCnt)
task := node.createVersionTask(verMgr.vol.Name, verSeq, op, addr.(string), verMgr.multiVersionList)
tasks = append(tasks, task)
return true
})
if verMgr.prepareCommit.prepareInfo.Status != proto.VersionWorking {
log.LogWarnf("action[verManager.createTask] vol %v status %v not working", verMgr.vol.Name, verMgr.status)
return
}
log.LogInfof("action[verManager.createTask] verSeq %v, datanode task cnt %v", verSeq, len(tasks))
cluster.addDataNodeTasks(tasks)
return
}
func (verMgr *VolVersionManager) createTaskToMetaNode(cluster *Cluster, verSeq uint64, op uint8, force bool) (err error) {
var (
mpHost sync.Map
ok bool
)
log.LogInfof("action[verManager.createTaskToMetaNode] vol %v verSeq %v, mp cnt %v, prepareCommit.nodeCnt %v",
verMgr.vol.Name, verSeq, len(verMgr.vol.MetaPartitions), verMgr.prepareCommit.nodeCnt)
verMgr.vol.mpsLock.RLock()
for _, mp := range verMgr.vol.MetaPartitions {
for _, host := range mp.Hosts {
mpHost.Store(host, nil)
}
mp.VerSeq = verSeq
}
verMgr.vol.mpsLock.RUnlock()
tasks := make([]*proto.AdminTask, 0)
cluster.metaNodes.Range(func(addr, metaNode interface{}) bool {
if _, ok = mpHost.Load(addr); !ok {
return true
}
node := metaNode.(*MetaNode)
if !node.IsActive {
if !force {
err = fmt.Errorf("node %v not alive", node.Addr)
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
return false
}
atomic.AddUint32(&verMgr.prepareCommit.commitCnt, 1)
}
verMgr.prepareCommit.nodeCnt++
log.LogInfof("action[createTaskToMetaNode] volume %v addr %v op %v verseq %v nodeCnt %v",
verMgr.vol.Name, addr.(string), op, verSeq, verMgr.prepareCommit.nodeCnt)
verMgr.prepareCommit.metaNodeArray.Store(node.Addr, TypeNoReply)
task := node.createVersionTask(verMgr.vol.Name, verSeq, op, addr.(string), verMgr.multiVersionList)
tasks = append(tasks, task)
return true
})
if verMgr.prepareCommit.prepareInfo.Status != proto.VersionWorking {
return
}
log.LogInfof("action[verManager.createTaskToMetaNode] vol %v verSeq %v, metaNodes task cnt %v", verMgr.vol.Name, verSeq, len(tasks))
cluster.addMetaNodeTasks(tasks)
return
}
func (verMgr *VolVersionManager) finishWork() {
log.LogDebugf("action[finishWork] vol %v VolVersionManager finishWork!", verMgr.vol.Name)
atomic.StoreUint32(&verMgr.status, proto.VersionWorkingFinished)
}
func (verMgr *VolVersionManager) startWork() (err error) {
var status uint32
log.LogDebugf("action[VolVersionManager.startWork] vol %v status %v", verMgr.status, verMgr.vol.Name)
if status = atomic.LoadUint32(&verMgr.status); status == proto.VersionWorking {
err = fmt.Errorf("have task still working,try it later")
log.LogWarnf("action[VolVersionManager.startWork] vol %v %v", verMgr.vol.Name, err)
return
}
if !atomic.CompareAndSwapUint32(&verMgr.status, status, proto.VersionWorking) {
err = fmt.Errorf("have task still working,try it later")
log.LogWarnf("action[VolVersionManager.startWork] vol %v %v", verMgr.vol.Name, err)
return
}
return
}
func (verMgr *VolVersionManager) getLayInfo(verSeq uint64) (int, bool) {
for idx, info := range verMgr.multiVersionList {
if info.Ver == verSeq {
return idx, true
}
}
return 0, false
}
func (verMgr *VolVersionManager) createTask(cluster *Cluster, verSeq uint64, op uint8, force bool) (ver *proto.VolVersionInfo, err error) {
log.LogInfof("action[VolVersionManager.createTask] vol %v verSeq %v op %v force %v ,prepareCommit.nodeCnt %v",
verMgr.vol.Name, verSeq, op, force, verMgr.prepareCommit.nodeCnt)
verMgr.RLock()
defer verMgr.RUnlock()
if err = verMgr.createTaskToDataNode(cluster, verSeq, op, force); err != nil {
log.LogInfof("action[VolVersionManager.createTask] vol %v err %v", verMgr.vol.Name, err)
return
}
if err = verMgr.createTaskToMetaNode(cluster, verSeq, op, force); err != nil {
log.LogInfof("action[VolVersionManager.createTask] vol %v err %v", verMgr.vol.Name, err)
return
}
log.LogInfof("action[VolVersionManager.createTask] exit")
return
}
func (verMgr *VolVersionManager) initVer2PhaseTask(verSeq uint64, op uint8) (verRsp *proto.VolVersionInfo, err error, opRes uint8) {
verMgr.prepareCommit.reset(verMgr.vol.Name)
log.LogWarnf("action[VolVersionManager.initVer2PhaseTask] vol %v verMgr.status %v op %v verSeq %v", verMgr.vol.Name, verMgr.status, op, verSeq)
if op == proto.CreateVersion {
if err = verMgr.GenerateVer(verSeq, op); err != nil {
log.LogInfof("action[VolVersionManager.initVer2PhaseTask] exit")
return
}
op = proto.CreateVersionPrepare
log.LogInfof("action[VolVersionManager.initVer2PhaseTask] CreateVersionPrepare")
} else if op == proto.DeleteVersion {
var (
idx int
found bool
)
if verMgr.enableMiddleOp {
if ver, status := verMgr.getOldestVer(); ver != verSeq || status != proto.VersionNormal {
err = fmt.Errorf("oldest is %v, status %v", ver, status)
return
}
}
if idx, found = verMgr.getLayInfo(verSeq); !found {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
log.LogErrorf("action[VolVersionManager.initVer2PhaseTask] vol %v op %v verSeq %v not found", verMgr.vol.Name, op, verSeq)
return nil, fmt.Errorf("not found"), op
}
if idx == len(verMgr.multiVersionList)-1 {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
log.LogErrorf("action[VolVersionManager.initVer2PhaseTask] vol %v op %v verSeq %v is uncommitted", verMgr.vol.Name, op, verSeq)
return nil, fmt.Errorf("uncommited version"), op
}
if verMgr.multiVersionList[idx].Status == proto.VersionDeleting {
log.LogErrorf("action[VolVersionManager.initVer2PhaseTask] vol %v op %v verSeq %v is uncommitted", verMgr.vol.Name, op, verSeq)
return nil, fmt.Errorf("version on deleting"), op
}
if verMgr.multiVersionList[idx].Status == proto.VersionDeleted {
log.LogErrorf("action[VolVersionManager.initVer2PhaseTask] vol %v op %v verSeq %v is uncommitted", verMgr.vol.Name, op, verSeq)
return nil, fmt.Errorf("version alreay be deleted"), op
}
verMgr.prepareCommit.op = op
verMgr.prepareCommit.prepareInfo = &proto.VolVersionInfo{
Ver: verSeq,
Status: proto.VersionWorking,
}
}
opRes = op
return
}
func (verMgr *VolVersionManager) createVer2PhaseTask(cluster *Cluster, verSeq uint64, op uint8, force bool) (verRsp *proto.VolVersionInfo, err error) {
if err = verMgr.startWork(); err != nil {
return
}
if !proto.IsHot(verMgr.vol.VolType) {
err = fmt.Errorf("vol need be hot one")
log.LogErrorf("vol %v createVer2PhaseTask. %v", verMgr.vol.Name, err)
return
}
defer func() {
if err != nil {
log.LogWarnf("action[createVer2PhaseTask] vol %v close lock due to err %v", verMgr.vol.Name, err)
verMgr.finishWork()
}
}()
if verRsp, err, op = verMgr.initVer2PhaseTask(verSeq, op); err != nil {
return
}
if op == proto.CreateVersion {
log.LogWarnf("action[createVer2PhaseTask] vol %v update seq %v to %v", verMgr.vol.Name, verSeq, verMgr.prepareCommit.prepareInfo.Ver)
verSeq = verMgr.prepareCommit.prepareInfo.Ver
}
if _, err = verMgr.createTask(cluster, verSeq, op, force); err != nil {
log.LogInfof("action[createVer2PhaseTask] vol %v CreateVersionPrepare err %v", verMgr.vol.Name, err)
return
}
verMgr.prepareCommit.op = op
wg := &sync.WaitGroup{}
wg.Add(1)
go func() {
wgFin := false
wgDone := func() {
if !wgFin {
wg.Done()
wgFin = true
}
}
log.LogInfof("action[createVer2PhaseTask] verseq %v op %v enter wait schedule", verSeq, verMgr.prepareCommit.op)
defer func() {
log.LogDebugf("action[createVer2PhaseTask] status %v", verMgr.status)
log.LogInfof("action[createVer2PhaseTask] verseq %v op %v exit wait schedule", verSeq, verMgr.prepareCommit.op)
if err != nil {
log.LogInfof("action[createVer2PhaseTask] verseq %v op %v exit schedule with err %v", verSeq, verMgr.prepareCommit.op, err)
}
wgDone()
}()
ticker := time.NewTicker(time.Second)
cnt := 0
for {
select {
case err = <-verMgr.wait:
log.LogInfof("action[createVer2PhaseTask] %v go routine verseq %v op %v get err %v", verMgr.vol.Name, verSeq, verMgr.prepareCommit.op, err)
if verMgr.prepareCommit.op == proto.DeleteVersion {
if err == nil {
verMgr.prepareCommit.reset(verMgr.vol.Name)
if err = verMgr.Persist(); err != nil {
log.LogErrorf("action[createVer2PhaseTask] vol %v err %v", verMgr.vol.Name, err)
return
}
verMgr.finishWork()
wgDone()
} else {
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
log.LogInfof("action[createVer2PhaseTask] vol %v prepare error %v", verMgr.vol.Name, err)
}
return
} else if verMgr.prepareCommit.op == proto.CreateVersionPrepare {
if err == nil {
verMgr.verSeq = verSeq
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.prepareCommit.op = proto.CreateVersionCommit
if err = verMgr.Persist(); err != nil {
log.LogErrorf("action[createVer2PhaseTask] vol %v err %v", verMgr.vol.Name, err)
return
}
log.LogInfof("action[createVer2PhaseTask] vol %v prepare fin.start commit", verMgr.vol.Name)
if _, err = verMgr.createTask(cluster, verSeq, verMgr.prepareCommit.op, force); err != nil {
log.LogInfof("action[createVer2PhaseTask] vol %v prepare error %v", verMgr.vol.Name, err)
return
}
if vLen := len(verMgr.multiVersionList); vLen > 1 {
verRsp = verMgr.multiVersionList[vLen-2]
}
wgDone()
} else {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
log.LogInfof("action[createVer2PhaseTask] vol %v prepare error %v", verMgr.vol.Name, err)
return
}
} else if verMgr.prepareCommit.op == proto.CreateVersionCommit {
log.LogInfof("action[createVer2PhaseTask] vol %v create ver task commit, create 2phase finished", verMgr.vol.Name)
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.finishWork()
return
} else {
log.LogErrorf("action[createVer2PhaseTask] vol %v op %v", verMgr.vol.Name, verMgr.prepareCommit.op)
return
}
case <-verMgr.cancel:
verMgr.prepareCommit.reset(verMgr.vol.Name)
log.LogInfof("action[createVer2PhaseTask.cancel] vol %v verseq %v op %v be canceled", verMgr.vol.Name, verSeq, verMgr.prepareCommit.op)
return
case <-ticker.C:
log.LogInfof("action[createVer2PhaseTask.tick] vol %v verseq %v op %v wait", verMgr.vol.Name, verSeq, verMgr.prepareCommit.op)
cnt++
if cnt > 5 {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingTimeOut
err = fmt.Errorf("verseq %v op %v be set timeout", verSeq, verMgr.prepareCommit.op)
log.LogInfof("action[createVer2PhaseTask] vol %v close lock due to err %v", verMgr.vol.Name, err)
if verMgr.prepareCommit.op == proto.CreateVersionCommit {
err = nil
}
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.finishWork()
return
}
}
}
}()
wg.Wait()
log.LogDebugf("action[createVer2PhaseTask] vol %v prepare phase finished", verMgr.vol.Name)
return
}
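// createVer2PhaseTask is the entry point for snapshot create/delete: a
// CreateVersion request is rewritten to CreateVersionPrepare, broadcast to
// all involved nodes, and once every node acknowledges it is re-broadcast as
// CreateVersionCommit before the manager persists and releases the working
// state. A hedged usage sketch, assuming the owning Vol exposes the manager
// as VersionMgr and the caller already holds a *Cluster:
//
//   seq := uint64(time.Now().UnixMicro())
//   verInfo, err := vol.VersionMgr.createVer2PhaseTask(c, seq, proto.CreateVersion, false)
//   if err != nil {
//       log.LogErrorf("create snapshot for vol %v failed: %v", vol.Name, err)
//   } else if verInfo != nil {
//       log.LogInfof("vol %v snapshot task finished, returned version %v", vol.Name, verInfo.Ver)
//   }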
func (verMgr *VolVersionManager) init(cluster *Cluster) error {
verMgr.c = cluster
log.LogWarnf("action[VolVersionManager.init] vol %v", verMgr.vol.Name)
verMgr.multiVersionList = append(verMgr.multiVersionList, &proto.VolVersionInfo{
Ver: 0,
Status: 1,
})
if cluster.partition.IsRaftLeader() {
return verMgr.Persist()
}
return nil
}
func (verMgr *VolVersionManager) getVersionInfo(verGet uint64) (verInfo *proto.VolVersionInfo, err error) {
verMgr.RLock()
defer verMgr.RUnlock()
if !proto.IsHot(verMgr.vol.VolType) {
err = fmt.Errorf("vol need be hot one")
log.LogErrorf("createVer2PhaseTask. %v", err)
return
}
log.LogDebugf("action[getVersionInfo] verGet %v", verGet)
for _, ver := range verMgr.multiVersionList {
if ver.Ver == verGet {
log.LogDebugf("action[getVersionInfo] ver %v", ver)
return ver, nil
}
log.LogDebugf("action[getVersionInfo] ver %v", ver)
if ver.Ver > verGet {
log.LogDebugf("action[getVersionInfo] ver %v", ver)
break
}
}
msg := fmt.Sprintf("ver [%v] not found", verGet)
log.LogInfof("action[getVersionInfo] %v", msg)
return nil, fmt.Errorf("%v", msg)
}
func (verMgr *VolVersionManager) getOldestVer() (ver uint64, status uint8) {
verMgr.RLock()
defer verMgr.RUnlock()
size := len(verMgr.multiVersionList)
if size <= 1 {
return 0, proto.VersionDeleteAbnormal
}
log.LogInfof("action[getLatestVer] ver len %v verMgr %v", size, verMgr)
return verMgr.multiVersionList[0].Ver, verMgr.multiVersionList[0].Status
}
func (verMgr *VolVersionManager) getVolDelStatus() (status uint8) {
verMgr.RLock()
defer verMgr.RUnlock()
size := len(verMgr.multiVersionList)
if size == 0 {
return 0
}
log.LogInfof("action[getLatestVer] ver len %v verMgr %v", size, verMgr)
return verMgr.multiVersionList[size-1].Status
}
func (verMgr *VolVersionManager) getLatestVer() (ver uint64) {
verMgr.RLock()
defer verMgr.RUnlock()
size := len(verMgr.multiVersionList)
if size == 0 {
return 0
}
return verMgr.multiVersionList[size-1].Ver
}
func (verMgr *VolVersionManager) getVersionList() *proto.VolVersionInfoList {
verMgr.RLock()
defer verMgr.RUnlock()
return &proto.VolVersionInfoList{
VerList: verMgr.multiVersionList,
Strategy: verMgr.strategy,
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"math/rand"
"sort"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
const RoundRobinNodeSelectorName = "RoundRobin"
const CarryWeightNodeSelectorName = "CarryWeight"
const AvailableSpaceFirstNodeSelectorName = "AvailableSpaceFirst"
const StrawNodeSelectorName = "Straw"
const DefaultNodeSelectorName = CarryWeightNodeSelectorName
func (ns *nodeSet) getNodes(nodeType NodeType) *sync.Map {
switch nodeType {
case DataNodeType:
return ns.dataNodes
case MetaNodeType:
return ns.metaNodes
default:
panic("unknown node type")
}
}
type NodeSelector interface {
GetName() string
Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error)
}
type weightedNode struct {
Carry float64
Weight float64
Ptr Node
ID uint64
}
// Node defines an interface that needs to be implemented by weightedNode
type Node interface {
SelectNodeForWrite()
GetID() uint64
GetAddr() string
}
// SortedWeightedNodes defines an array sorted by carry
type SortedWeightedNodes []*weightedNode
func (nodes SortedWeightedNodes) Len() int {
return len(nodes)
}
func (nodes SortedWeightedNodes) Less(i, j int) bool {
return nodes[i].Carry > nodes[j].Carry
}
func (nodes SortedWeightedNodes) Swap(i, j int) {
nodes[i], nodes[j] = nodes[j], nodes[i]
}
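// Because Less compares Carry in descending order, sorting puts the node
// with the largest carry first. A minimal sketch:
//
//   nodes := SortedWeightedNodes{{Carry: 0.4}, {Carry: 1.2}, {Carry: 0.9}}
//   sort.Sort(nodes) // nodes[0].Carry == 1.2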
func canAllocPartition(node interface{}, nodeType NodeType) bool {
switch nodeType {
case DataNodeType:
dataNode := node.(*DataNode)
return dataNode.canAlloc() && dataNode.canAllocDp()
case MetaNodeType:
metaNode := node.(*MetaNode)
return metaNode.isWritable()
default:
panic("unknown node type")
}
}
func asNodeWrap(node interface{}, nodeType NodeType) Node {
switch nodeType {
case DataNodeType:
dataNode := node.(*DataNode)
return dataNode
case MetaNodeType:
metaNode := node.(*MetaNode)
return metaNode
default:
panic("unknown node type")
}
}
type CarryWeightNodeSelector struct {
nodeType NodeType
carry map[uint64]float64
}
func (s *CarryWeightNodeSelector) GetName() string {
return CarryWeightNodeSelectorName
}
func (s *CarryWeightNodeSelector) prepareCarryForDataNodes(nodes *sync.Map, total uint64) {
nodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if _, ok := s.carry[dataNode.ID]; !ok {
// use available space to calculate initial weight
s.carry[dataNode.ID] = float64(dataNode.AvailableSpace) / float64(total)
}
return true
})
}
func (s *CarryWeightNodeSelector) prepareCarryForMetaNodes(nodes *sync.Map, total uint64) {
nodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
if _, ok := s.carry[metaNode.ID]; !ok {
// use available space to calculate initial weight
s.carry[metaNode.ID] = float64(metaNode.Total-metaNode.Used) / float64(total)
}
return true
})
}
func (s *CarryWeightNodeSelector) prepareCarry(nodes *sync.Map, total uint64) {
switch s.nodeType {
case DataNodeType:
s.prepareCarryForDataNodes(nodes, total)
case MetaNodeType:
s.prepareCarryForMetaNodes(nodes, total)
default:
}
}
func (s *CarryWeightNodeSelector) getTotalMaxForDataNodes(nodes *sync.Map) (total uint64) {
nodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if dataNode.Total > total {
total = dataNode.Total
}
return true
})
return
}
func (s *CarryWeightNodeSelector) getTotalMaxForMetaNodes(nodes *sync.Map) (total uint64) {
nodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
if metaNode.Total > total {
total = metaNode.Total
}
return true
})
return
}
func (s *CarryWeightNodeSelector) getTotalMax(nodes *sync.Map) (total uint64) {
switch s.nodeType {
case DataNodeType:
total = s.getTotalMaxForDataNodes(nodes)
case MetaNodeType:
total = s.getTotalMaxForMetaNodes(nodes)
default:
}
return
}
func (s *CarryWeightNodeSelector) getCarryDataNodes(maxTotal uint64, excludeHosts []string, dataNodes *sync.Map) (nodeTabs SortedWeightedNodes, availCount int) {
nodeTabs = make(SortedWeightedNodes, 0)
dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if contains(excludeHosts, dataNode.Addr) {
// log.LogDebugf("[getAvailCarryDataNodeTab] dataNode [%v] is excludeHosts", dataNode.Addr)
return true
}
if !dataNode.canAllocDp() {
log.LogDebugf("[getAvailCarryDataNodeTab] dataNode [%v] is not writeable, offline %v, dpCnt %d",
dataNode.Addr, dataNode.ToBeOffline, dataNode.DataPartitionCount)
return true
}
if !dataNode.canAlloc() {
log.LogWarnf("[getAvailCarryDataNodeTab] dataNode [%v] is overSold", dataNode.Addr)
return true
}
if s.carry[dataNode.ID] >= 1.0 {
availCount++
}
nt := new(weightedNode)
nt.Carry = s.carry[dataNode.ID]
nt.Weight = float64(dataNode.AvailableSpace) / float64(maxTotal)
nt.Ptr = dataNode
nodeTabs = append(nodeTabs, nt)
return true
})
return
}
func (s *CarryWeightNodeSelector) getCarryMetaNodes(maxTotal uint64, excludeHosts []string, metaNodes *sync.Map) (nodes SortedWeightedNodes, availCount int) {
nodes = make(SortedWeightedNodes, 0)
metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
if contains(excludeHosts, metaNode.Addr) {
return true
}
if !metaNode.isWritable() {
return true
}
if s.carry[metaNode.ID] >= 1.0 {
availCount++
}
nt := new(weightedNode)
nt.Carry = s.carry[metaNode.ID]
nt.Weight = (float64)(metaNode.Total-metaNode.Used) / (float64)(maxTotal)
nt.Ptr = metaNode
nodes = append(nodes, nt)
return true
})
return
}
func (s *CarryWeightNodeSelector) getCarryNodes(nset *nodeSet, maxTotal uint64, excludeHosts []string) (SortedWeightedNodes, int) {
switch s.nodeType {
case DataNodeType:
return s.getCarryDataNodes(maxTotal, excludeHosts, nset.dataNodes)
case MetaNodeType:
return s.getCarryMetaNodes(maxTotal, excludeHosts, nset.metaNodes)
default:
panic("unknown node type")
}
}
func (s *CarryWeightNodeSelector) setNodeCarry(nodes SortedWeightedNodes, availCarryCount, replicaNum int) {
for availCarryCount < replicaNum {
availCarryCount = 0
for _, nt := range nodes {
carry := nt.Carry + nt.Weight
// cap the carry value so that
// subsequent selections cannot overload a single node
if carry > 10.0 {
carry = 10.0
}
nt.Carry = carry
s.carry[nt.Ptr.GetID()] = carry
if carry > 1.0 {
availCarryCount++
}
}
}
}
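// Worked example for the carry loop above (illustrative numbers only): for
// replicaNum=2 and two nodes with weights 0.6 and 0.3 starting at zero
// carry, successive passes give carries (0.6, 0.3), (1.2, 0.6), (1.8, 0.9)
// and finally (2.4, 1.2), where both carries exceed 1.0 and the loop stops.
// Select then sorts by carry, picks the heaviest nodes first, and
// selectNodeForWrite subtracts 1.0 from each pick so lighter nodes catch up
// on later calls, keeping placement roughly proportional to free space.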
func (s *CarryWeightNodeSelector) selectNodeForWrite(node Node) {
node.SelectNodeForWrite()
// decrease node weight
s.carry[node.GetID()] -= 1.0
}
func (s *CarryWeightNodeSelector) Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
nodes := ns.getNodes(s.nodeType)
total := s.getTotalMax(nodes)
// prepare carry for every node
s.prepareCarry(nodes, total)
orderHosts := make([]string, 0)
newHosts = make([]string, 0)
peers = make([]proto.Peer, 0)
// if replica == 0, return
if replicaNum == 0 {
return
}
// if we cannot get enough writable nodes, return error
weightedNodes, count := s.getCarryNodes(ns, total, excludeHosts)
if len(weightedNodes) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(weightedNodes))
return
}
// create enough carry nodes
// we say a node is "carry node", whent its carry >= 1.0
s.setNodeCarry(weightedNodes, count, replicaNum)
// sort nodes by weight
sort.Sort(weightedNodes)
// pick first N nodes
for i := 0; i < replicaNum; i++ {
node := weightedNodes[i].Ptr
s.selectNodeForWrite(node)
orderHosts = append(orderHosts, node.GetAddr())
peer := proto.Peer{ID: node.GetID(), Addr: node.GetAddr()}
peers = append(peers, peer)
}
log.LogInfof("action[%vNodeSelector::Select] peers[%v]", s.GetName(), peers)
// reshuffle for primary-backup replication
if newHosts, err = reshuffleHosts(orderHosts); err != nil {
err = fmt.Errorf("action[%vNodeSelector::Select] err:%v orderHosts is nil", s.GetName(), err.Error())
return
}
return
}
func NewCarryWeightNodeSelector(nodeType NodeType) *CarryWeightNodeSelector {
return &CarryWeightNodeSelector{
carry: make(map[uint64]float64),
nodeType: nodeType,
}
}
type AvailableSpaceFirstNodeSelector struct {
nodeType NodeType
}
func (s *AvailableSpaceFirstNodeSelector) getNodeAvailableSpace(node interface{}) uint64 {
switch s.nodeType {
case DataNodeType:
dataNode := node.(*DataNode)
return dataNode.AvailableSpace
case MetaNodeType:
metaNode := node.(*MetaNode)
return metaNode.Total - metaNode.Used
default:
panic("unkown node type")
}
}
func (s *AvailableSpaceFirstNodeSelector) GetName() string {
return AvailableSpaceFirstNodeSelectorName
}
func (s *AvailableSpaceFirstNodeSelector) Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
newHosts = make([]string, 0)
peers = make([]proto.Peer, 0)
// if replica == 0, return
if replicaNum == 0 {
return
}
orderHosts := make([]string, 0)
nodes := ns.getNodes(s.nodeType)
sortedNodes := make([]Node, 0)
nodes.Range(func(key, value interface{}) bool {
sortedNodes = append(sortedNodes, asNodeWrap(value, s.nodeType))
return true
})
// if we cannot get enough nodes, return error
if len(sortedNodes) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(sortedNodes))
return
}
// sort nodes by available space
sort.Slice(sortedNodes, func(i, j int) bool {
return s.getNodeAvailableSpace(sortedNodes[i]) > s.getNodeAvailableSpace(sortedNodes[j])
})
nodeIndex := 0
// pick first N nodes
for i := 0; i < replicaNum && nodeIndex < len(sortedNodes); i++ {
selectedIndex := len(sortedNodes)
// loop until we get a writable node
for nodeIndex < len(sortedNodes) {
node := sortedNodes[nodeIndex]
nodeIndex += 1
if canAllocPartition(node, s.nodeType) {
if excludeHosts == nil || !contains(excludeHosts, node.GetAddr()) {
selectedIndex = nodeIndex - 1
break
}
}
}
// if we get a writable node, append it to host list
if selectedIndex != len(sortedNodes) {
node := sortedNodes[selectedIndex]
node.SelectNodeForWrite()
orderHosts = append(orderHosts, node.GetAddr())
peer := proto.Peer{ID: node.GetID(), Addr: node.GetAddr()}
peers = append(peers, peer)
}
}
// if we cannot get enough writable nodes, return error
if len(orderHosts) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(orderHosts))
return
}
log.LogInfof("action[%vNodeSelector::Select] peers[%v]", s.GetName(), peers)
// reshuffle for primary-backup replication
if newHosts, err = reshuffleHosts(orderHosts); err != nil {
err = fmt.Errorf("action[%vNodeSelector::Select] err:%v orderHosts is nil", s.GetName(), err.Error())
return
}
return
}
func NewAvailableSpaceFirstNodeSelector(nodeType NodeType) *AvailableSpaceFirstNodeSelector {
return &AvailableSpaceFirstNodeSelector{
nodeType: nodeType,
}
}
type RoundRobinNodeSelector struct {
index int
nodeType NodeType
}
func (s *RoundRobinNodeSelector) GetName() string {
return RoundRobinNodeSelectorName
}
func (s *RoundRobinNodeSelector) Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
newHosts = make([]string, 0)
peers = make([]proto.Peer, 0)
// if replica == 0, return
if replicaNum == 0 {
return
}
orderHosts := make([]string, 0)
nodes := ns.getNodes(s.nodeType)
sortedNodes := make([]Node, 0)
nodes.Range(func(key, value interface{}) bool {
sortedNodes = append(sortedNodes, asNodeWrap(value, s.nodeType))
return true
})
// if we cannot get enough nodes, return error
if len(sortedNodes) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(sortedNodes))
return
}
// sort nodes by id, so we can get a node list that is as stable as possible
sort.Slice(sortedNodes, func(i, j int) bool {
return sortedNodes[i].GetID() < sortedNodes[j].GetID()
})
nodeIndex := 0
// pick first N nodes
for i := 0; i < replicaNum && nodeIndex < len(sortedNodes); i++ {
selectedIndex := len(sortedNodes)
// loop until we get a writable node
for nodeIndex < len(sortedNodes) {
node := sortedNodes[(nodeIndex+s.index)%len(sortedNodes)]
nodeIndex += 1
if canAllocPartition(node, s.nodeType) {
if excludeHosts == nil || !contains(excludeHosts, node.GetAddr()) {
selectedIndex = nodeIndex - 1
break
}
}
}
// if we get a writable node, append it to host list
if selectedIndex != len(sortedNodes) {
node := sortedNodes[(selectedIndex+s.index)%len(sortedNodes)]
orderHosts = append(orderHosts, node.GetAddr())
node.SelectNodeForWrite()
peer := proto.Peer{ID: node.GetID(), Addr: node.GetAddr()}
peers = append(peers, peer)
}
}
// if we cannot get enough writable nodes, return error
if len(orderHosts) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(orderHosts))
return
}
// move the index of selector
s.index += nodeIndex
log.LogInfof("action[%vNodeSelector::Select] peers[%v]", s.GetName(), peers)
// reshuffle for primary-backup replication
if newHosts, err = reshuffleHosts(orderHosts); err != nil {
err = fmt.Errorf("action[%vNodeSelector::Select] err:%v orderHosts is nil", s.GetName(), err.Error())
return
}
return
}
func NewRoundRobinNodeSelector(nodeType NodeType) *RoundRobinNodeSelector {
return &RoundRobinNodeSelector{
nodeType: nodeType,
}
}
const (
StrawNodeSelectorRandMax = 65536
)
// NOTE: this node selector is inspired by the Straw2 algorithm, which is widely used in Ceph
type StrawNodeSelector struct {
rand *rand.Rand
nodeType NodeType
}
func (s *StrawNodeSelector) GetName() string {
return StrawNodeSelectorName
}
func (s *StrawNodeSelector) getWeight(node Node) float64 {
switch s.nodeType {
case DataNodeType:
dataNode := node.(*DataNode)
return float64(dataNode.AvailableSpace) / util.GB
case MetaNodeType:
metaNode := node.(*MetaNode)
return float64(metaNode.Total-metaNode.Used) / util.GB
default:
panic("unkown node type")
}
}
func (s *StrawNodeSelector) selectOneNode(nodes []Node) (index int, maxNode Node) {
maxStraw := float64(0)
index = -1
for i, node := range nodes {
straw := float64(s.rand.Intn(StrawNodeSelectorRandMax))
straw = math.Log(straw/float64(StrawNodeSelectorRandMax)) / s.getWeight(node)
if index == -1 || straw > maxStraw {
maxStraw = straw
maxNode = node
index = i
}
}
return
}
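// The draw above follows the Straw2 idea: each node receives
// straw = ln(u) / weight with u sampled uniformly from [0, 1), so every
// straw is negative (or -Inf when u happens to be 0) and a larger weight
// pulls the value toward zero, making that node more likely to hold the
// maximum. Illustrative numbers: for u = 0.5, a node with weight 2 gets
// ln(0.5)/2 ≈ -0.35 while weight 1 gets ≈ -0.69, so the heavier node wins
// this particular draw; over many draws nodes are chosen roughly in
// proportion to their available-space weight.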
func (s *StrawNodeSelector) Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
nodes := make([]Node, 0)
ns.getNodes(s.nodeType).Range(func(key, value interface{}) bool {
node := asNodeWrap(value, s.nodeType)
if !contains(excludeHosts, node.GetAddr()) {
nodes = append(nodes, node)
}
return true
})
orderHosts := make([]string, 0)
for len(orderHosts) < replicaNum {
if len(nodes)+len(orderHosts) < replicaNum {
break
}
index, node := s.selectOneNode(nodes)
if index != 0 {
nodes[0], nodes[index] = node, nodes[0]
}
nodes = nodes[1:]
if !canAllocPartition(node, s.nodeType) {
continue
}
orderHosts = append(orderHosts, node.GetAddr())
node.SelectNodeForWrite()
peer := proto.Peer{ID: node.GetID(), Addr: node.GetAddr()}
peers = append(peers, peer)
}
// if we cannot get enough writable nodes, return error
if len(orderHosts) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(orderHosts))
return
}
log.LogInfof("action[%vNodeSelector::Select] peers[%v]", s.GetName(), peers)
// reshuffle for primary-backup replication
if newHosts, err = reshuffleHosts(orderHosts); err != nil {
err = fmt.Errorf("action[%vNodeSelector::Select] err:%v orderHosts is nil", s.GetName(), err.Error())
return
}
return
}
func NewStrawNodeSelector(nodeType NodeType) *StrawNodeSelector {
return &StrawNodeSelector{
rand: rand.New(rand.NewSource(time.Now().UnixMicro())),
nodeType: nodeType,
}
}
func NewNodeSelector(name string, nodeType NodeType) NodeSelector {
switch name {
case RoundRobinNodeSelectorName:
return NewRoundRobinNodeSelector(nodeType)
case CarryWeightNodeSelectorName:
return NewCarryWeightNodeSelector(nodeType)
case AvailableSpaceFirstNodeSelectorName:
return NewAvailableSpaceFirstNodeSelector(nodeType)
case StrawNodeSelectorName:
return NewStrawNodeSelector(nodeType)
default:
return NewCarryWeightNodeSelector(nodeType)
}
}
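// A hedged usage sketch of the factory above; the nodeSet, exclude list and
// replica count are assumed to come from the caller, mirroring how
// getAvailDataNodeHosts below drives the configured selector:
//
//   selector := NewNodeSelector(StrawNodeSelectorName, DataNodeType)
//   hosts, peers, err := selector.Select(ns, nil, 3)
//   if err != nil {
//       log.LogErrorf("select 3 data hosts failed: %v", err)
//   } else {
//       log.LogInfof("picked hosts %v peers %v", hosts, peers)
//   }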
func (ns *nodeSet) getAvailMetaNodeHosts(excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
ns.nodeSelectLock.Lock()
defer ns.nodeSelectLock.Unlock()
// we need a read lock to block modification of the node selector
ns.metaNodeSelectorLock.RLock()
defer ns.metaNodeSelectorLock.RUnlock()
return ns.metaNodeSelector.Select(ns, excludeHosts, replicaNum)
}
func (ns *nodeSet) getAvailDataNodeHosts(excludeHosts []string, replicaNum int) (hosts []string, peers []proto.Peer, err error) {
ns.nodeSelectLock.Lock()
defer ns.nodeSelectLock.Unlock()
// we need a read lock to block modification of the node selector
ns.dataNodeSelectorLock.RLock()
defer ns.dataNodeSelectorLock.RUnlock()
return ns.dataNodeSelector.Select(ns, excludeHosts, replicaNum)
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"math"
"math/rand"
"sort"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
)
const RoundRobinNodesetSelectorName = "RoundRobin"
const CarryWeightNodesetSelectorName = "CarryWeight"
const AvailableSpaceFirstNodesetSelectorName = "AvailableSpaceFirst"
const StrawNodesetSelectorName = "Straw"
const DefaultNodesetSelectorName = RoundRobinNodesetSelectorName
func (ns *nodeSet) getDataNodeTotalSpace() (totalSpace uint64) {
ns.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
totalSpace += dataNode.Total
return true
})
return
}
func (ns *nodeSet) getMetaNodeTotalSpace() (totalSpace uint64) {
ns.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
totalSpace += metaNode.Total
return true
})
return
}
func (ns *nodeSet) getDataNodeTotalAvailableSpace() (space uint64) {
ns.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if !dataNode.ToBeOffline {
space += dataNode.AvailableSpace
}
return true
})
return
}
func (ns *nodeSet) getMetaNodeTotalAvailableSpace() (space uint64) {
ns.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
if !metaNode.ToBeOffline {
space += metaNode.Total - metaNode.Used
}
return true
})
return
}
func (ns *nodeSet) canWriteFor(nodeType NodeType, replica int) bool {
switch nodeType {
case DataNodeType:
return ns.canWriteForDataNode(replica)
case MetaNodeType:
return ns.canWriteForMetaNode(replica)
default:
panic("unknow node type")
}
}
func (ns *nodeSet) getTotalSpaceOf(nodeType NodeType) uint64 {
switch nodeType {
case DataNodeType:
return ns.getDataNodeTotalSpace()
case MetaNodeType:
return ns.getMetaNodeTotalSpace()
default:
panic("unknow node type")
}
}
func (ns *nodeSet) getTotalAvailableSpaceOf(nodeType NodeType) uint64 {
switch nodeType {
case DataNodeType:
return ns.getDataNodeTotalAvailableSpace()
case MetaNodeType:
return ns.getMetaNodeTotalAvailableSpace()
default:
panic("unknow node type")
}
}
type NodesetSelector interface {
GetName() string
Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error)
}
type RoundRobinNodesetSelector struct {
index int
nodeType NodeType
}
func (s *RoundRobinNodesetSelector) Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
// sort nodesets by id, so we can get a nodeset list that is as stable as possible
sort.Slice(nsc, func(i, j int) bool {
return nsc[i].ID < nsc[j].ID
})
for i := 0; i < len(nsc); i++ {
if s.index >= len(nsc) {
s.index = 0
}
ns = nsc[s.index]
s.index++
if containsID(excludeNodeSets, ns.ID) {
continue
}
if ns.canWriteFor(s.nodeType, int(replicaNum)) {
return
}
}
switch s.nodeType {
case DataNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
case MetaNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateMetaPartition)
default:
panic("unknow node type")
}
return
}
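// The selector above keeps a cursor across calls: nodesets are visited in
// ascending-ID order starting from where the previous call stopped, and the
// cursor wraps once it runs past the end, so successive partition creations
// spread over all writable nodesets. Illustrative trace with nodeset IDs
// {1, 2, 3}, all writable and none excluded: call 1 picks 1, call 2 picks 2,
// call 3 picks 3, call 4 wraps back to 1.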
func (s *RoundRobinNodesetSelector) GetName() string {
return RoundRobinNodesetSelectorName
}
func NewRoundRobinNodesetSelector(nodeType NodeType) *RoundRobinNodesetSelector {
return &RoundRobinNodesetSelector{
nodeType: nodeType,
}
}
type CarryWeightNodesetSelector struct {
carrys map[uint64]float64
nodeType NodeType
}
func (s *CarryWeightNodesetSelector) GetName() string {
return CarryWeightNodesetSelectorName
}
func (s *CarryWeightNodesetSelector) getMaxTotal(nsc nodeSetCollection) uint64 {
total := uint64(0)
for i := 0; i < nsc.Len(); i++ {
tmp := nsc[i].getTotalSpaceOf(s.nodeType)
if tmp > total {
total = tmp
}
}
return total
}
func (s *CarryWeightNodesetSelector) prepareCarry(nsc nodeSetCollection, total uint64) {
for _, nodeset := range nsc {
id := nodeset.ID
if _, ok := s.carrys[id]; !ok {
// use total available space to calculate initial weight
s.carrys[id] = float64(nodeset.getTotalAvailableSpaceOf(s.nodeType)) / float64(total)
}
}
}
func (s *CarryWeightNodesetSelector) getAvailNodesets(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (newNsc nodeSetCollection) {
newNsc = make(nodeSetCollection, 0, nsc.Len())
for i := 0; i < nsc.Len(); i++ {
ns := nsc[i]
if ns.canWriteFor(s.nodeType, int(replicaNum)) && !containsID(excludeNodeSets, ns.ID) {
newNsc = append(newNsc, ns)
}
}
return
}
func (s *CarryWeightNodesetSelector) getCarryCount(nsc nodeSetCollection) (count int) {
for i := 0; i < nsc.Len(); i++ {
ns := nsc[i]
if s.carrys[ns.ID] >= 1.0 {
count += 1
}
}
return
}
func (s *CarryWeightNodesetSelector) setNodesetCarry(nsc nodeSetCollection, total uint64) int {
count := s.getCarryCount(nsc)
for count < 1 {
count = 0
for i := 0; i < nsc.Len(); i++ {
nset := nsc[i]
weight := float64(nset.getTotalAvailableSpaceOf(s.nodeType)) / float64(total)
s.carrys[nset.ID] += weight
if s.carrys[nset.ID] >= 1.0 {
count += 1
}
// limit the max value of weight
if s.carrys[nset.ID] > 10.0 {
s.carrys[nset.ID] = 10.0
}
}
}
return count
}
func (s *CarryWeightNodesetSelector) Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
total := s.getMaxTotal(nsc)
// prepare the weight of every nodeset
s.prepareCarry(nsc, total)
nsc = s.getAvailNodesets(nsc, excludeNodeSets, replicaNum)
availCount := 0
if len(nsc) < 1 {
goto err
}
availCount = s.setNodesetCarry(nsc, total)
// sort nodesets by weight
sort.Slice(nsc, func(i, j int) bool {
return s.carrys[nsc[i].ID] > s.carrys[nsc[j].ID]
})
// pick the first nodeset that has N writable nodes
for i := 0; i < availCount; i++ {
ns = nsc[i]
if ns.canWriteFor(s.nodeType, int(replicaNum)) && !containsID(excludeNodeSets, ns.ID) {
break
}
}
if ns != nil {
if !ns.canWriteFor(s.nodeType, int(replicaNum)) || containsID(excludeNodeSets, ns.ID) {
goto err
}
s.carrys[ns.ID] -= 1.0
}
return
err:
switch s.nodeType {
case DataNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
case MetaNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateMetaPartition)
default:
panic("unknow node type")
}
return
}
func NewCarryWeightNodesetSelector(nodeType NodeType) *CarryWeightNodesetSelector {
return &CarryWeightNodesetSelector{
carrys: make(map[uint64]float64),
nodeType: nodeType,
}
}
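// Illustrative note (not part of the original code; the numbers are hypothetical):
// the carry works like a stride scheduler. With a max total space of 100 units,
// a nodeset A with 80 units available and a nodeset B with 40 units available
// gain 0.8 and 0.4 carry per round of setNodesetCarry; A crosses 1.0 first and,
// over many calls to Select, is picked roughly twice as often as B, paying 1.0
// carry each time it is chosen. The carry is capped at 10.0 so an idle nodeset
// cannot accumulate an unbounded advantage.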
type AvailableSpaceFirstNodesetSelector struct {
nodeType NodeType
}
func (s *AvailableSpaceFirstNodesetSelector) GetName() string {
return AvailableSpaceFirstNodesetSelectorName
}
func (s *AvailableSpaceFirstNodesetSelector) Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
// sort nodesets by available space
sort.Slice(nsc, func(i, j int) bool {
return nsc[i].getTotalAvailableSpaceOf(s.nodeType) > nsc[j].getTotalAvailableSpaceOf(s.nodeType)
})
// pick the first nodeset that has N writable nodes
for i := 0; i < nsc.Len(); i++ {
ns = nsc[i]
if ns.canWriteFor(s.nodeType, int(replicaNum)) && !containsID(excludeNodeSets, ns.ID) {
return
}
}
switch s.nodeType {
case DataNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
case MetaNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateMetaPartition)
default:
panic("unknow node type")
}
return
}
func NewAvailableSpaceFirstNodesetSelector(nodeType NodeType) *AvailableSpaceFirstNodesetSelector {
return &AvailableSpaceFirstNodesetSelector{
nodeType: nodeType,
}
}
const (
StrawNodesetSelectorRandMax = 65536
)
// NOTE: this nodeset selector is inspired by the Straw2 algorithm, which is widely used in Ceph
type StrawNodesetSelector struct {
nodeType NodeType
rand *rand.Rand
}
func (s *StrawNodesetSelector) GetName() string {
return StrawNodesetSelectorName
}
func (s *StrawNodesetSelector) getWeight(ns *nodeSet) float64 {
return float64(ns.getTotalAvailableSpaceOf(s.nodeType) / util.GB)
}
func (s *StrawNodesetSelector) Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
tmp := make(nodeSetCollection, 0)
for _, nodeset := range nsc {
if nodeset.canWriteFor(s.nodeType, int(replicaNum)) && !containsID(excludeNodeSets, nodeset.ID) {
tmp = append(tmp, nodeset)
}
}
nsc = tmp
if len(nsc) < 1 {
switch s.nodeType {
case DataNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
case MetaNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateMetaPartition)
default:
panic("unknow node type")
}
return
}
maxStraw := float64(0)
for _, nodeset := range nsc {
straw := float64(s.rand.Intn(StrawNodesetSelectorRandMax))
straw = math.Log(straw/float64(StrawNodesetSelectorRandMax)) / s.getWeight(nodeset)
if ns == nil || straw > maxStraw {
ns = nodeset
maxStraw = straw
}
}
return
}
func NewStrawNodesetSelector(nodeType NodeType) *StrawNodesetSelector {
return &StrawNodesetSelector{
nodeType: nodeType,
rand: rand.New(rand.NewSource(time.Now().Unix())),
}
}
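// Illustrative sketch of the selection math above (not part of the original
// code): each candidate nodeset i draws r uniformly from [0, 65536) and computes
//
//	straw_i = ln(r/65536) / w_i
//
// where w_i is its available space in GB. Since ln of a value in (0, 1) is
// negative, a larger w_i shrinks the magnitude of straw_i, so bigger nodesets
// tend to produce the largest straw and win proportionally more often; the
// loop in Select keeps the maximum straw. A draw of r == 0 yields -Inf and
// effectively loses against any other candidate.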
func NewNodesetSelector(name string, nodeType NodeType) NodesetSelector {
switch name {
case CarryWeightNodesetSelectorName:
return NewCarryWeightNodesetSelector(nodeType)
case RoundRobinNodesetSelectorName:
return NewRoundRobinNodesetSelector(nodeType)
case AvailableSpaceFirstNodesetSelectorName:
return NewAvailableSpaceFirstNodesetSelector(nodeType)
case StrawNodesetSelectorName:
return NewStrawNodesetSelector(nodeType)
default:
return NewRoundRobinNodesetSelector(nodeType)
}
}
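// The helper below is an illustrative usage sketch only; the name
// exampleSelectNodeset is not part of the original code. It shows how a caller
// might obtain a selector from the factory above and pick a nodeset for a
// 3-replica data partition (unknown selector names fall back to RoundRobin).
func exampleSelectNodeset(nsc nodeSetCollection) (*nodeSet, error) {
selector := NewNodesetSelector(StrawNodesetSelectorName, DataNodeType)
// no nodesets are excluded here; real callers pass the IDs to skip
return selector.Select(nsc, nil, 3)
}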
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"crypto/md5"
"encoding/hex"
"encoding/json"
"fmt"
"math/rand"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
func newCreateDataPartitionRequest(volName string, ID uint64, replicaNum int, members []proto.Peer,
dataPartitionSize, leaderSize int, hosts []string, createType int, partitionType int,
decommissionedDisks []string, verSeq uint64) (req *proto.CreateDataPartitionRequest) {
req = &proto.CreateDataPartitionRequest{
PartitionTyp: partitionType,
PartitionId: ID,
PartitionSize: dataPartitionSize,
ReplicaNum: replicaNum,
VolumeId: volName,
Members: members,
Hosts: hosts,
CreateType: createType,
LeaderSize: leaderSize,
DecommissionedDisks: decommissionedDisks,
VerSeq: verSeq,
}
return
}
func newDeleteDataPartitionRequest(ID uint64) (req *proto.DeleteDataPartitionRequest) {
req = &proto.DeleteDataPartitionRequest{
PartitionId: ID,
}
return
}
func newAddDataPartitionRaftMemberRequest(ID uint64, addPeer proto.Peer) (req *proto.AddDataPartitionRaftMemberRequest) {
req = &proto.AddDataPartitionRaftMemberRequest{
PartitionId: ID,
AddPeer: addPeer,
}
return
}
func newRemoveDataPartitionRaftMemberRequest(ID uint64, removePeer proto.Peer) (req *proto.RemoveDataPartitionRaftMemberRequest) {
req = &proto.RemoveDataPartitionRaftMemberRequest{
PartitionId: ID,
RemovePeer: removePeer,
}
return
}
func newLoadDataPartitionMetricRequest(ID uint64) (req *proto.LoadDataPartitionRequest) {
req = &proto.LoadDataPartitionRequest{
PartitionId: ID,
}
return
}
func newStopDataPartitionRepairRequest(ID uint64, stop bool) (req *proto.StopDataPartitionRepairRequest) {
req = &proto.StopDataPartitionRepairRequest{
PartitionId: ID,
Stop: stop,
}
return
}
func unmarshalTaskResponse(task *proto.AdminTask) (err error) {
bytes, err := json.Marshal(task.Response)
if err != nil {
return
}
var response interface{}
switch task.OpCode {
case proto.OpDataNodeHeartbeat:
response = &proto.DataNodeHeartbeatResponse{}
case proto.OpDeleteDataPartition:
response = &proto.DeleteDataPartitionResponse{}
case proto.OpLoadDataPartition:
response = &proto.LoadDataPartitionResponse{}
case proto.OpDeleteFile:
response = &proto.DeleteFileResponse{}
case proto.OpMetaNodeHeartbeat:
response = &proto.MetaNodeHeartbeatResponse{}
case proto.OpDeleteMetaPartition:
response = &proto.DeleteMetaPartitionResponse{}
case proto.OpUpdateMetaPartition:
response = &proto.UpdateMetaPartitionResponse{}
case proto.OpDecommissionMetaPartition:
response = &proto.MetaPartitionDecommissionResponse{}
case proto.OpVersionOperation:
response = &proto.MultiVersionOpResponse{}
case proto.OpLcNodeHeartbeat:
response = &proto.LcNodeHeartbeatResponse{}
case proto.OpLcNodeScan:
response = &proto.LcNodeRuleTaskResponse{}
case proto.OpLcNodeSnapshotVerDel:
response = &proto.SnapshotVerDelTaskResponse{}
default:
log.LogError(fmt.Sprintf("unknown operate code(%v)", task.OpCode))
}
if response == nil {
return fmt.Errorf("unmarshalTaskResponse failed")
}
if err = json.Unmarshal(bytes, response); err != nil {
return
}
task.Response = response
return
}
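// Illustrative note (not part of the original code): task.Response typically
// arrives as a generically decoded value, so the helper above round-trips it
// through JSON to recover the concrete type matching the OpCode, e.g.
//
//	task := &proto.AdminTask{OpCode: proto.OpDataNodeHeartbeat, Response: raw}
//	if err := unmarshalTaskResponse(task); err == nil {
//		resp := task.Response.(*proto.DataNodeHeartbeatResponse)
//		_ = resp
//	}
//
// where raw is whatever was decoded from the wire; the type assertion must
// match the OpCode handled in the switch above.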
func contains(arr []string, element string) (ok bool) {
if len(arr) == 0 {
return
}
for _, e := range arr {
if e == element {
ok = true
break
}
}
return
}
func containsID(arr []uint64, element uint64) bool {
if len(arr) == 0 {
return false
}
for _, e := range arr {
if e == element {
return true
}
}
return false
}
func reshuffleHosts(oldHosts []string) (newHosts []string, err error) {
if len(oldHosts) == 0 {
log.LogError(fmt.Sprintf("action[reshuffleHosts],err:%v", proto.ErrReshuffleArray))
err = proto.ErrReshuffleArray
return
}
lenOldHosts := len(oldHosts)
newHosts = make([]string, lenOldHosts)
if lenOldHosts == 1 {
copy(newHosts, oldHosts)
return
}
// seed once before the shuffle instead of reseeding on every iteration
rand.Seed(time.Now().UnixNano())
for i := lenOldHosts; i > 1; i-- {
oCurrPos := rand.Intn(i)
oldHosts[i-1], oldHosts[oCurrPos] = oldHosts[oCurrPos], oldHosts[i-1]
}
copy(newHosts, oldHosts)
return
}
// Warn logs a warning for the given cluster and reports it to the exporter
func Warn(clusterID, msg string) {
key := fmt.Sprintf("%s_%s", clusterID, ModuleName)
WarnBySpecialKey(key, msg)
}
// WarnBySpecialKey logs the warning message and reports it to the exporter
func WarnBySpecialKey(key, msg string) {
log.LogWarn(msg)
exporter.Warning(msg)
}
func keyNotFound(name string) (err error) {
return errors.NewErrorf("parameter %v not found", name)
}
func unmatchedKey(name string) (err error) {
return errors.NewErrorf("parameter %v not match", name)
}
func txInvalidMask() (err error) {
return errors.New("transaction mask key value pair should be: enableTxMaskKey=[create|mkdir|remove|rename|mknod|symlink|link]\n enableTxMaskKey=off \n enableTxMaskKey=all")
}
func notFoundMsg(name string) (err error) {
return errors.NewErrorf("%v not found", name)
}
func metaPartitionNotFound(id uint64) (err error) {
return notFoundMsg(fmt.Sprintf("meta partition[%v]", id))
}
func metaReplicaNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("meta replica[%v]", addr))
}
func dataPartitionNotFound(id uint64) (err error) {
return notFoundMsg(fmt.Sprintf("data partition[%v]", id))
}
func dataReplicaNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("data replica[%v]", addr))
}
func zoneNotFound(name string) (err error) {
return notFoundMsg(fmt.Sprintf("zone[%v]", name))
}
func nodeSetNotFound(id uint64) (err error) {
return notFoundMsg(fmt.Sprintf("node set[%v]", id))
}
func dataNodeNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("data node[%v]", addr))
}
func metaNodeNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("meta node[%v]", addr))
}
func lcNodeNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("lc node[%v]", addr))
}
func volNotFound(name string) (err error) {
return notFoundMsg(fmt.Sprintf("vol[%v]", name))
}
func matchKey(serverKey, clientKey string) bool {
h := md5.New()
_, err := h.Write([]byte(serverKey))
if err != nil {
log.LogWarnf("action[matchKey] write server key[%v] failed,err[%v]", serverKey, err)
return false
}
cipherStr := h.Sum(nil)
return strings.EqualFold(clientKey, hex.EncodeToString(cipherStr))
}
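// Illustrative note (not part of the original code): a client passes the check
// above by sending the hex-encoded MD5 of the shared server key, e.g.
//
//	sum := md5.Sum([]byte(serverKey))
//	clientKey := hex.EncodeToString(sum[:])
//
// The comparison is case-insensitive, so either hex case is accepted.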
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"context"
"fmt"
syslog "log"
"net/http"
"net/http/httputil"
"regexp"
"strconv"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/raftstore/raftstore_db"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// configuration keys
const (
ClusterName = "clusterName"
ID = "id"
IP = "ip"
Port = "port"
LogLevel = "logLevel"
LogDir = "logDir"
WalDir = "walDir"
StoreDir = "storeDir"
EbsAddrKey = "ebsAddr"
BStoreAddrKey = "bStoreAddr"
EbsServicePathKey = "ebsServicePath"
BStoreServicePathKey = "bStoreServicePath"
GroupID = 1
ModuleName = "master"
CfgRetainLogs = "retainLogs"
DefaultRetainLogs = 20000
cfgTickInterval = "tickInterval"
cfgRaftRecvBufSize = "raftRecvBufSize"
cfgElectionTick = "electionTick"
SecretKey = "masterServiceKey"
Stat = "stat"
Authenticate = "authenticate"
AuthNodeHost = "authNodeHost"
AuthNodeEnableHTTPS = "authNodeEnableHTTPS"
AuthNodeCertFile = "authNodeCertFile"
)
var (
// regexps for data validation
volNameRegexp = regexp.MustCompile("^[a-zA-Z0-9][a-zA-Z0-9_.-]{1,61}[a-zA-Z0-9]$")
ownerRegexp = regexp.MustCompile("^[A-Za-z][A-Za-z0-9_]{0,20}$")
useConnPool = true // for test
gConfig *clusterConfig
)
var overSoldFactor = defaultOverSoldFactor
func overSoldLimit() bool {
if overSoldFactor <= 0 {
return false
}
return true
}
func overSoldCap(cap uint64) uint64 {
if overSoldFactor <= 0 {
return cap
}
return uint64(float32(cap) * overSoldFactor)
}
func setOverSoldFactor(factor float32) {
if factor != overSoldFactor {
overSoldFactor = factor
}
}
var volNameErr = errors.New("name can only start and end with numbers or letters, and its length can't be less than 3")
// Server represents the server in a cluster
type Server struct {
id uint64
clusterName string
ip string
bindIp bool
port string
logDir string
walDir string
storeDir string
bStoreAddr string
servicePath string
retainLogs uint64
tickInterval int
raftRecvBufSize int
electionTick int
leaderInfo *LeaderInfo
config *clusterConfig
cluster *Cluster
user *User
rocksDBStore *raftstore_db.RocksDBStore
raftStore raftstore.RaftStore
fsm *MetadataFsm
partition raftstore.Partition
wg sync.WaitGroup
reverseProxy *httputil.ReverseProxy
metaReady bool
apiServer *http.Server
}
// NewServer creates a new server
func NewServer() *Server {
return &Server{}
}
// Start starts a server
func (m *Server) Start(cfg *config.Config) (err error) {
m.config = newClusterConfig()
gConfig = m.config
m.leaderInfo = &LeaderInfo{}
m.reverseProxy = m.newReverseProxy()
if err = m.checkConfig(cfg); err != nil {
log.LogError(errors.Stack(err))
return
}
if m.rocksDBStore, err = raftstore_db.NewRocksDBStoreAndRecovery(m.storeDir, LRUCacheSize, WriteBufferSize); err != nil {
return
}
if err = m.createRaftServer(cfg); err != nil {
log.LogError(errors.Stack(err))
return
}
m.initCluster()
m.initUser()
m.cluster.partition = m.partition
m.cluster.idAlloc.partition = m.partition
MasterSecretKey := cfg.GetString(SecretKey)
if m.cluster.MasterSecretKey, err = cryptoutil.Base64Decode(MasterSecretKey); err != nil {
return fmt.Errorf("action[Start] failed %v, err: master service Key invalid = %s", proto.ErrInvalidCfg, MasterSecretKey)
}
m.cluster.authenticate = cfg.GetBool(Authenticate)
if m.cluster.authenticate {
m.cluster.initAuthentication(cfg)
}
m.cluster.scheduleTask()
m.startHTTPService(ModuleName, cfg)
exporter.RegistConsul(m.clusterName, ModuleName, cfg)
WarnMetrics = newWarningMetrics(m.cluster)
metricsService := newMonitorMetrics(m.cluster)
metricsService.start()
if _, err = stat.NewStatistic(m.logDir, Stat, int64(stat.DefaultStatLogSize),
stat.DefaultTimeOutUs, true); err != nil {
return
}
m.wg.Add(1)
return nil
}
// Shutdown closes the server
func (m *Server) Shutdown() {
var err error
if m.apiServer != nil {
if err = m.apiServer.Shutdown(context.Background()); err != nil {
log.LogErrorf("action[Shutdown] failed, err: %v", err)
}
}
stat.CloseStat()
// stop raftServer first
if m.fsm != nil {
m.fsm.Stop()
}
// then stop rocksDBStore
time.Sleep(time.Second)
if m.rocksDBStore != nil {
m.rocksDBStore.Close()
}
m.wg.Done()
}
// Sync waits for the execution termination of the server
func (m *Server) Sync() {
m.wg.Wait()
}
func (m *Server) checkConfig(cfg *config.Config) (err error) {
m.clusterName = cfg.GetString(ClusterName)
m.ip = cfg.GetString(IP)
m.bindIp = cfg.GetBool(proto.BindIpKey)
m.port = cfg.GetString(proto.ListenPort)
m.logDir = cfg.GetString(LogDir)
m.walDir = cfg.GetString(WalDir)
m.storeDir = cfg.GetString(StoreDir)
m.bStoreAddr = cfg.GetString(BStoreAddrKey)
if m.bStoreAddr == "" {
m.bStoreAddr = cfg.GetString(EbsAddrKey)
}
m.servicePath = cfg.GetString(BStoreServicePathKey)
if m.servicePath == "" {
m.servicePath = cfg.GetString(EbsServicePathKey)
}
peerAddrs := cfg.GetString(cfgPeers)
if m.port == "" || m.walDir == "" || m.storeDir == "" || m.clusterName == "" || peerAddrs == "" {
return fmt.Errorf("%v,err:%v,%v,%v,%v,%v,%v", proto.ErrInvalidCfg, "one of (listen,walDir,storeDir,clusterName) is null",
m.port, m.walDir, m.storeDir, m.clusterName, peerAddrs)
}
if m.id, err = strconv.ParseUint(cfg.GetString(ID), 10, 64); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
m.config.DisableAutoCreate = cfg.GetBoolWithDefault(disableAutoCreate, false)
syslog.Printf("get disableAutoCreate cfg %v", m.config.DisableAutoCreate)
m.config.faultDomain = cfg.GetBoolWithDefault(faultDomain, false)
m.config.heartbeatPort = cfg.GetInt64(heartbeatPortKey)
m.config.replicaPort = cfg.GetInt64(replicaPortKey)
if m.config.heartbeatPort <= 1024 {
m.config.heartbeatPort = raftstore.DefaultHeartbeatPort
}
if m.config.replicaPort <= 1024 {
m.config.replicaPort = raftstore.DefaultReplicaPort
}
syslog.Printf("heartbeatPort[%v],replicaPort[%v]\n", m.config.heartbeatPort, m.config.replicaPort)
if err = m.config.parsePeers(peerAddrs); err != nil {
return
}
nodeSetCapacity := cfg.GetString(nodeSetCapacity)
if nodeSetCapacity != "" {
if m.config.nodeSetCapacity, err = strconv.Atoi(nodeSetCapacity); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.config.nodeSetCapacity < 3 {
m.config.nodeSetCapacity = defaultNodeSetCapacity
}
m.config.DefaultNormalZoneCnt = defaultNodeSetGrpBatchCnt
m.config.DomainBuildAsPossible = cfg.GetBoolWithDefault(cfgDomainBuildAsPossible, false)
domainBatchGrpCnt := cfg.GetString(cfgDomainBatchGrpCnt)
if domainBatchGrpCnt != "" {
if m.config.DefaultNormalZoneCnt, err = strconv.Atoi(domainBatchGrpCnt); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
metaNodeReservedMemory := cfg.GetString(cfgMetaNodeReservedMem)
if metaNodeReservedMemory != "" {
if m.config.metaNodeReservedMem, err = strconv.ParseUint(metaNodeReservedMemory, 10, 64); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.config.metaNodeReservedMem < 32*1024*1024 {
m.config.metaNodeReservedMem = defaultMetaNodeReservedMem
}
retainLogs := cfg.GetString(CfgRetainLogs)
if retainLogs != "" {
if m.retainLogs, err = strconv.ParseUint(retainLogs, 10, 64); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.retainLogs <= 0 {
m.retainLogs = DefaultRetainLogs
}
syslog.Println("retainLogs=", m.retainLogs)
missingDataPartitionInterval := cfg.GetString(missingDataPartitionInterval)
if missingDataPartitionInterval != "" {
if m.config.MissingDataPartitionInterval, err = strconv.ParseInt(missingDataPartitionInterval, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
dpNoLeaderReportInterval := cfg.GetString(cfgDpNoLeaderReportIntervalSec)
if dpNoLeaderReportInterval != "" {
if m.config.DpNoLeaderReportIntervalSec, err = strconv.ParseInt(dpNoLeaderReportInterval, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
mpNoLeaderReportInterval := cfg.GetString(cfgMpNoLeaderReportIntervalSec)
if mpNoLeaderReportInterval != "" {
if m.config.MpNoLeaderReportIntervalSec, err = strconv.ParseInt(mpNoLeaderReportInterval, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
dataPartitionTimeOutSec := cfg.GetString(dataPartitionTimeOutSec)
if dataPartitionTimeOutSec != "" {
if m.config.DataPartitionTimeOutSec, err = strconv.ParseInt(dataPartitionTimeOutSec, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
numberOfDataPartitionsToLoad := cfg.GetString(NumberOfDataPartitionsToLoad)
if numberOfDataPartitionsToLoad != "" {
if m.config.numberOfDataPartitionsToLoad, err = strconv.Atoi(numberOfDataPartitionsToLoad); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.config.numberOfDataPartitionsToLoad <= 40 {
m.config.numberOfDataPartitionsToLoad = 40
}
if secondsToFreeDP := cfg.GetString(secondsToFreeDataPartitionAfterLoad); secondsToFreeDP != "" {
if m.config.secondsToFreeDataPartitionAfterLoad, err = strconv.ParseInt(secondsToFreeDP, 10, 64); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
intervalToScanS3ExpirationVal := cfg.GetString(intervalToScanS3Expiration)
if intervalToScanS3ExpirationVal != "" {
if m.config.IntervalToScanS3Expiration, err = strconv.ParseInt(intervalToScanS3ExpirationVal, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
m.tickInterval = int(cfg.GetFloat(cfgTickInterval))
m.raftRecvBufSize = int(cfg.GetInt(cfgRaftRecvBufSize))
m.electionTick = int(cfg.GetFloat(cfgElectionTick))
if m.tickInterval <= 300 {
m.tickInterval = 500
}
if m.electionTick <= 3 {
m.electionTick = 5
}
maxQuotaNumPerVol := cfg.GetString(cfgMaxQuotaNumPerVol)
if maxQuotaNumPerVol != "" {
if m.config.MaxQuotaNumPerVol, err = strconv.Atoi(maxQuotaNumPerVol); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
m.config.MonitorPushAddr = cfg.GetString(cfgMonitorPushAddr)
m.config.volForceDeletion = cfg.GetBoolWithDefault(cfgVolForceDeletion, true)
threshold := cfg.GetInt64WithDefault(cfgVolDeletionDentryThreshold, 0)
if threshold < 0 {
return fmt.Errorf("volDeletionDentryThreshold can't be less than 0 ! ")
}
m.config.volDeletionDentryThreshold = uint64(threshold)
return
}
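// Illustrative configuration sketch (not part of the original code; the literal
// spellings of the listen/peers/bindIp keys are assumptions based on the
// constants referenced above, and all values are placeholders):
//
//	{
//	    "clusterName": "cfs_dev",
//	    "id": "1",
//	    "ip": "192.168.0.11",
//	    "listen": "17010",
//	    "logDir": "/var/logs/cfs/master",
//	    "walDir": "/var/logs/cfs/master/wal",
//	    "storeDir": "/var/logs/cfs/master/store",
//	    "peers": "1:192.168.0.11:17010,2:192.168.0.12:17010,3:192.168.0.13:17010",
//	    "retainLogs": "20000"
//	}
//
// checkConfig rejects the config when listen, walDir, storeDir, clusterName or
// peers is empty, and falls back to built-in defaults for the optional keys.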
func (m *Server) createRaftServer(cfg *config.Config) (err error) {
raftCfg := &raftstore.Config{
NodeID: m.id,
RaftPath: m.walDir,
IPAddr: cfg.GetString(IP),
NumOfLogsToRetain: m.retainLogs,
HeartbeatPort: int(m.config.heartbeatPort),
ReplicaPort: int(m.config.replicaPort),
TickInterval: m.tickInterval,
ElectionTick: m.electionTick,
RecvBufSize: m.raftRecvBufSize,
}
if m.raftStore, err = raftstore.NewRaftStore(raftCfg, cfg); err != nil {
return errors.Trace(err, "NewRaftStore failed! id[%v] walPath[%v]", m.id, m.walDir)
}
syslog.Printf("peers[%v],tickInterval[%v],electionTick[%v]\n", m.config.peers, m.tickInterval, m.electionTick)
m.initFsm()
partitionCfg := &raftstore.PartitionConfig{
ID: GroupID,
Peers: m.config.peers,
Applied: m.fsm.applied,
SM: m.fsm,
}
if m.partition, err = m.raftStore.CreatePartition(partitionCfg); err != nil {
return errors.Trace(err, "CreatePartition failed")
}
return
}
func (m *Server) initFsm() {
m.fsm = newMetadataFsm(m.rocksDBStore, m.retainLogs, m.raftStore.RaftServer())
m.fsm.registerLeaderChangeHandler(m.handleLeaderChange)
m.fsm.registerPeerChangeHandler(m.handlePeerChange)
// register the handlers for the interfaces defined in the Raft library
m.fsm.registerApplySnapshotHandler(m.handleApplySnapshot)
m.fsm.registerRaftUserCmdApplyHandler(m.handleRaftUserCmd)
m.fsm.restore()
}
func (m *Server) initCluster() {
log.LogInfo("action[initCluster] begin")
m.cluster = newCluster(m.clusterName, m.leaderInfo, m.fsm, m.partition, m.config)
m.cluster.retainLogs = m.retainLogs
log.LogInfo("action[initCluster] end")
// in case any limiter is set on a follower
log.LogInfo("action[loadApiLimiterInfo] begin")
m.cluster.loadApiLimiterInfo()
log.LogInfo("action[loadApiLimiterInfo] end")
}
func (m *Server) initUser() {
log.LogInfo("action[initUser] begin")
m.user = newUser(m.fsm, m.partition)
log.LogInfo("action[initUser] end")
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type snapshotDelManager struct {
cluster *Cluster
lcSnapshotTaskStatus *lcSnapshotVerStatus
lcNodeStatus *lcNodeStatus
idleNodeCh chan struct{}
exitCh chan struct{}
}
func newSnapshotManager() *snapshotDelManager {
log.LogInfof("action[newSnapshotManager] construct")
snapshotMgr := &snapshotDelManager{
lcSnapshotTaskStatus: newLcSnapshotVerStatus(),
lcNodeStatus: newLcNodeStatus(),
idleNodeCh: make(chan struct{}, 1000), // supports notifying multiple snapshot tasks
exitCh: make(chan struct{}),
}
return snapshotMgr
}
func (m *snapshotDelManager) process() {
for {
select {
case <-m.exitCh:
log.LogInfo("exitCh notified, snapshotDelManager process exit")
return
case <-m.idleNodeCh:
log.LogDebug("idleLcNodeCh notified")
task := m.lcSnapshotTaskStatus.GetOneTask()
if task == nil {
log.LogDebugf("lcSnapshotTaskStatus.GetOneTask, no task")
continue
}
nodeAddr := m.lcNodeStatus.GetIdleNode()
if nodeAddr == "" {
log.LogWarn("no idle lcnode, redo task")
m.lcSnapshotTaskStatus.RedoTask(task)
continue
}
val, ok := m.cluster.lcNodes.Load(nodeAddr)
if !ok {
log.LogErrorf("lcNodes.Load, nodeAddr(%v) is not available, redo task", nodeAddr)
m.lcNodeStatus.RemoveNode(nodeAddr)
m.lcSnapshotTaskStatus.RedoTask(task)
continue
}
node := val.(*LcNode)
adminTask := node.createSnapshotVerDelTask(m.cluster.masterAddr(), task)
m.cluster.addLcNodeTasks([]*proto.AdminTask{adminTask})
log.LogDebugf("add snapshot version del task(%v) to lcnode(%v)", *task, nodeAddr)
}
}
}
func (m *snapshotDelManager) notifyIdleLcNode() {
m.lcSnapshotTaskStatus.RLock()
defer m.lcSnapshotTaskStatus.RUnlock()
if len(m.lcSnapshotTaskStatus.VerInfos) > 0 {
select {
case m.idleNodeCh <- struct{}{}:
log.LogDebug("action[handleLcNodeHeartbeatResp], snapshotDelManager scan routine notified!")
default:
log.LogDebug("action[handleLcNodeHeartbeatResp], snapshotDelManager skipping notify!")
}
}
}
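// Illustrative flow note (not part of the original code): an lcNode heartbeat
// response calls notifyIdleLcNode, which pushes into idleNodeCh (non-blocking)
// only while VerInfos still holds pending tasks; process then pops one task via
// GetOneTask, picks an idle lcnode from lcNodeStatus and ships a snapshot
// version delete admin task to it, re-queueing the task when no node is idle.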
//----------------------------------------------
type lcSnapshotVerStatus struct {
sync.RWMutex
VerInfos map[string]*proto.SnapshotVerDelTask
TaskResults map[string]*proto.SnapshotVerDelTaskResponse
}
func newLcSnapshotVerStatus() *lcSnapshotVerStatus {
return &lcSnapshotVerStatus{
VerInfos: make(map[string]*proto.SnapshotVerDelTask),
TaskResults: make(map[string]*proto.SnapshotVerDelTaskResponse),
}
}
func (vs *lcSnapshotVerStatus) GetOneTask() (task *proto.SnapshotVerDelTask) {
vs.Lock()
defer vs.Unlock()
if len(vs.VerInfos) == 0 {
return
}
for _, i := range vs.VerInfos {
task = i
break
}
if task == nil {
return
}
delete(vs.VerInfos, task.Id)
t := time.Now()
vs.TaskResults[task.Id] = &proto.SnapshotVerDelTaskResponse{
ID: task.Id,
UpdateTime: &t,
}
log.LogDebugf("GetOneTask(%v) and add TaskResults", task)
return
}
func (vs *lcSnapshotVerStatus) RedoTask(task *proto.SnapshotVerDelTask) {
vs.Lock()
defer vs.Unlock()
if task == nil {
return
}
vs.VerInfos[task.Id] = task
}
func (vs *lcSnapshotVerStatus) AddVerInfo(task *proto.SnapshotVerDelTask) {
vs.Lock()
defer vs.Unlock()
if len(vs.VerInfos) > 10000 {
return
}
if _, ok := vs.TaskResults[task.Id]; ok {
log.LogDebugf("VerInfo: %v is in TaskResults, already in processing", task)
return
}
vs.VerInfos[task.Id] = task
log.LogDebugf("AddVerInfo task: %v, now num: %v", task, len(vs.VerInfos))
}
func (vs *lcSnapshotVerStatus) ResetVerInfos() {
vs.Lock()
defer vs.Unlock()
log.LogDebugf("ResetVerInfos remove num %v", len(vs.VerInfos))
vs.VerInfos = make(map[string]*proto.SnapshotVerDelTask)
}
func (vs *lcSnapshotVerStatus) AddResult(resp *proto.SnapshotVerDelTaskResponse) {
vs.Lock()
defer vs.Unlock()
vs.TaskResults[resp.ID] = resp
}
func (vs *lcSnapshotVerStatus) DeleteOldResult() {
vs.Lock()
defer vs.Unlock()
for k, v := range vs.TaskResults {
// delete results that are already done
if v.Done && time.Now().After(v.EndTime.Add(time.Minute*10)) {
delete(vs.TaskResults, k)
log.LogDebugf("delete result already done: %v", v)
}
// delete results that are not done but have stopped updating
if !v.Done && time.Now().After(v.UpdateTime.Add(time.Minute*10)) {
delete(vs.TaskResults, k)
log.LogWarnf("delete result that is not done but has stopped updating: %v", v)
}
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"container/list"
"fmt"
"sort"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
type topology struct {
dataNodes *sync.Map
metaNodes *sync.Map
zoneMap *sync.Map
zoneIndexForDataNode int
zoneIndexForMetaNode int
zones []*Zone
domainExcludeZones []string // not domain zone, empty if domain disable.
zoneLock sync.RWMutex
}
func newTopology() (t *topology) {
t = new(topology)
t.zoneMap = new(sync.Map)
t.dataNodes = new(sync.Map)
t.metaNodes = new(sync.Map)
t.zones = make([]*Zone, 0)
return
}
func (t *topology) zoneLen() int {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
return len(t.zones)
}
func (t *topology) clear() {
t.dataNodes.Range(func(key, value interface{}) bool {
t.dataNodes.Delete(key)
return true
})
t.metaNodes.Range(func(key, value interface{}) bool {
t.metaNodes.Delete(key)
return true
})
}
func (t *topology) putZone(zone *Zone) (err error) {
t.zoneLock.Lock()
defer t.zoneLock.Unlock()
if _, ok := t.zoneMap.Load(zone.name); ok {
return fmt.Errorf("zone[%v] has exist", zone.name)
}
t.zoneMap.Store(zone.name, zone)
t.zones = append(t.zones, zone)
return
}
func (t *topology) putZoneIfAbsent(zone *Zone) (beStoredZone *Zone) {
t.zoneLock.Lock()
defer t.zoneLock.Unlock()
oldZone, ok := t.zoneMap.Load(zone.name)
if ok {
return oldZone.(*Zone)
}
t.zoneMap.Store(zone.name, zone)
t.zones = append(t.zones, zone)
beStoredZone = zone
return
}
func (t *topology) getZoneNameList() (zoneList []string) {
zoneList = make([]string, 0)
t.zoneMap.Range(func(zoneName, value interface{}) bool {
zoneList = append(zoneList, zoneName.(string))
return true
})
return zoneList
}
func (t *topology) getZone(name string) (zone *Zone, err error) {
t.zoneMap.Range(func(zoneName, value interface{}) bool {
if zoneName != name {
return true
}
zone = value.(*Zone)
return true
})
if zone == nil {
return nil, fmt.Errorf("zone[%v] is not found", name)
}
return
}
func (t *topology) putDataNode(dataNode *DataNode) (err error) {
if _, ok := t.dataNodes.Load(dataNode.Addr); ok {
return
}
zone, err := t.getZone(dataNode.ZoneName)
if err != nil {
return
}
zone.putDataNode(dataNode)
t.putDataNodeToCache(dataNode)
return
}
func (t *topology) putDataNodeToCache(dataNode *DataNode) {
t.dataNodes.Store(dataNode.Addr, dataNode)
}
func (t *topology) deleteDataNode(dataNode *DataNode) {
zone, err := t.getZone(dataNode.ZoneName)
if err != nil {
return
}
zone.deleteDataNode(dataNode)
t.dataNodes.Delete(dataNode.Addr)
}
func (t *topology) getZoneByDataNode(dataNode *DataNode) (zone *Zone, err error) {
_, ok := t.dataNodes.Load(dataNode.Addr)
if !ok {
return nil, errors.Trace(dataNodeNotFound(dataNode.Addr), "%v not found", dataNode.Addr)
}
return t.getZone(dataNode.ZoneName)
}
func (t *topology) putMetaNode(metaNode *MetaNode) (err error) {
if _, ok := t.metaNodes.Load(metaNode.Addr); ok {
return
}
zone, err := t.getZone(metaNode.ZoneName)
if err != nil {
return
}
zone.putMetaNode(metaNode)
t.putMetaNodeToCache(metaNode)
return
}
func (t *topology) deleteMetaNode(metaNode *MetaNode) {
t.metaNodes.Delete(metaNode.Addr)
zone, err := t.getZone(metaNode.ZoneName)
if err != nil {
return
}
zone.deleteMetaNode(metaNode)
}
func (t *topology) putMetaNodeToCache(metaNode *MetaNode) {
t.metaNodes.Store(metaNode.Addr, metaNode)
}
type nodeSetCollection []*nodeSet
func (nsc nodeSetCollection) Len() int {
return len(nsc)
}
func (nsc nodeSetCollection) Less(i, j int) bool {
return nsc[i].metaNodeLen() < nsc[j].metaNodeLen()
}
func (nsc nodeSetCollection) Swap(i, j int) {
nsc[i], nsc[j] = nsc[j], nsc[i]
}
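// Illustrative note (not part of the original code): nodeSetCollection
// implements sort.Interface, so callers can order nodesets by ascending meta
// node count with a plain sort.Sort(nsc).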
type nodeSetGroup struct {
ID uint64
domainId uint64
nsgInnerIndex int // used when the number of replicas to allocate differs from the group's standard nodeset count
nodeSets []*nodeSet
nodeSetsIds []uint64
status uint8
sync.RWMutex
}
func newNodeSetGrp(c *Cluster) *nodeSetGroup {
var id uint64
var err error
if id, err = c.idAlloc.allocateCommonID(); err != nil {
return nil
}
log.LogInfof("action[newNodeSetGrp] construct,id[%v]", id)
nsg := &nodeSetGroup{
ID: id,
status: normal,
}
return nsg
}
type DomainNodeSetGrpManager struct {
domainId uint64
nsgIndex int // round-robin index for allocating hosts from available nodesetGrps with a balanced policy
nodeSetGrpMap []*nodeSetGroup
zoneAvailableNodeSet map[string]*list.List
nsId2NsGrpMap map[uint64]int // map nodeset id to nodeset group index in nodeSetGrpMap
lastBuildIndex int // build index for the 2-plus-1 policy; multiple zones need balanced building
status uint8 // all nodesetGrps may be unavailable, or no nodesetGrp exists under the given policy
nsIdMap map[uint64]int // stores all nodesets that have already been put into the manager
}
type DomainManager struct {
c *Cluster
init bool // manager can't be used in some startup stage before load
domainNodeSetGrpVec []*DomainNodeSetGrpManager
domainId2IndexMap map[uint64]int
ZoneName2DomainIdMap map[string]uint64
excludeZoneListDomain map[string]int // zones from an upgraded old datastore keep using the old (non-domain) policy
dataRatioLimit float64
excludeZoneUseRatio float64
sync.RWMutex
}
func newDomainNodeSetGrpManager() *DomainNodeSetGrpManager {
log.LogInfof("action[newDomainManager] construct")
ns := &DomainNodeSetGrpManager{
nsgIndex: 0,
zoneAvailableNodeSet: make(map[string]*list.List),
nsId2NsGrpMap: make(map[uint64]int),
nsIdMap: make(map[uint64]int),
}
return ns
}
func newDomainManager(cls *Cluster) *DomainManager {
log.LogInfof("action[newDomainManager] construct")
ns := &DomainManager{
c: cls,
domainId2IndexMap: make(map[uint64]int),
ZoneName2DomainIdMap: make(map[string]uint64),
excludeZoneListDomain: make(map[string]int),
dataRatioLimit: defaultDomainUsageThreshold,
excludeZoneUseRatio: defaultDomainUsageThreshold,
}
return ns
}
func (nsgm *DomainManager) start() {
log.LogInfof("action[DomainManager:start] start")
nsgm.init = true
}
func (nsgm *DomainManager) createDomain(zoneName string) (err error) {
if !nsgm.init {
return fmt.Errorf("createDomain failed: domain manager is not initialized")
}
log.LogInfof("zone name [%v] createDomain", zoneName)
zoneList := strings.Split(zoneName, ",")
grpRegion := newDomainNodeSetGrpManager()
if grpRegion.domainId, err = nsgm.c.idAlloc.allocateCommonID(); err != nil {
return fmt.Errorf("createDomain err [%v]", err)
}
nsgm.Lock()
for i := 0; i < len(zoneList); i++ {
if domainId, ok := nsgm.ZoneName2DomainIdMap[zoneList[i]]; ok {
nsgm.Unlock()
return fmt.Errorf("zone name [%v] exist in domain [%v]", zoneList[i], domainId)
}
}
nsgm.domainNodeSetGrpVec = append(nsgm.domainNodeSetGrpVec, grpRegion)
for i := 0; i < len(zoneList); i++ {
nsgm.ZoneName2DomainIdMap[zoneList[i]] = grpRegion.domainId
nsgm.domainId2IndexMap[grpRegion.domainId] = len(nsgm.domainNodeSetGrpVec) - 1
log.LogInfof("action[createDomain] domainid [%v] zonename [%v] index [%v]", grpRegion.domainId, zoneList[i], len(nsgm.domainNodeSetGrpVec)-1)
}
nsgm.Unlock()
if err = nsgm.c.putZoneDomain(false); err != nil {
return fmt.Errorf("putZoneDomain err [%v]", err)
}
return
}
func (nsgm *DomainManager) checkExcludeZoneState() {
if len(nsgm.excludeZoneListDomain) == 0 {
log.LogInfof("action[checkExcludeZoneState] no excludeZoneList for Domain,size zero")
return
}
excludeNeedDomain := true
log.LogInfof("action[checkExcludeZoneState] excludeZoneList size[%v]", len(nsgm.excludeZoneListDomain))
for zoneNm := range nsgm.excludeZoneListDomain {
if value, ok := nsgm.c.t.zoneMap.Load(zoneNm); ok {
zone := value.(*Zone)
if nsgm.excludeZoneUseRatio == 0 || nsgm.excludeZoneUseRatio > 1 {
nsgm.excludeZoneUseRatio = defaultDomainUsageThreshold
}
if zone.isUsedRatio(nsgm.excludeZoneUseRatio) {
if zone.status == normalZone {
log.LogInfof("action[checkExcludeZoneState] zone[%v] be set unavailableZone", zone.name)
}
zone.status = unavailableZone
} else {
excludeNeedDomain = false
if zone.status == unavailableZone {
log.LogInfof("action[checkExcludeZoneState] zone[%v] be set normalZone", zone.name)
}
zone.status = normalZone
}
}
}
if excludeNeedDomain {
log.LogInfof("action[checkExcludeZoneState] exclude zone cann't be used since now!excludeNeedDomain[%v]",
excludeNeedDomain)
nsgm.c.needFaultDomain = true
} else {
if nsgm.c.needFaultDomain {
log.LogInfof("action[checkExcludeZoneState] needFaultDomain be set false")
}
nsgm.c.needFaultDomain = false
}
}
func (nsgm *DomainManager) checkAllGrpState() {
for i := 0; i < len(nsgm.domainNodeSetGrpVec); i++ {
nsgm.checkGrpState(nsgm.domainNodeSetGrpVec[i])
}
}
func (nsgm *DomainManager) checkGrpState(domainGrpManager *DomainNodeSetGrpManager) {
nsgm.RLock()
defer nsgm.RUnlock()
if len(domainGrpManager.nodeSetGrpMap) == 0 {
log.LogInfof("action[checkGrpState] leave,size zero")
return
}
log.LogInfof("action[checkGrpState] nodeSetGrpMap size [%v]", len(domainGrpManager.nodeSetGrpMap))
metaUnAvailableCnt := 0
dataUnAvailableCnt := 0
for i := 0; i < len(domainGrpManager.nodeSetGrpMap); i++ {
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], status[%v]",
i, domainGrpManager.nodeSetGrpMap[i].ID, domainGrpManager.nodeSetGrpMap[i].status)
grpStatus := normal
grpMetaUnAvailableCnt := 0
for j := 0; j < len(domainGrpManager.nodeSetGrpMap[i].nodeSets); j++ {
var (
metaWorked bool
dataWorked bool
used uint64
total uint64
)
domainGrpManager.nodeSetGrpMap[i].nodeSets[j].dataNodes.Range(func(key, value interface{}) bool {
node := value.(*DataNode)
if node.isWriteAble() {
used = used + node.Used
} else {
used = used + node.Total
}
total = total + node.Total
log.LogInfof("action[checkGrpState] nodeid[%v] zonename[%v] used [%v] total [%v] UsageRatio [%v] got available metanode",
node.ID, node.ZoneName, node.Used, node.Total, node.UsageRatio)
return true
})
if float64(used)/float64(total) < nsgm.dataRatioLimit {
dataWorked = true
}
domainGrpManager.nodeSetGrpMap[i].nodeSets[j].metaNodes.Range(func(key, value interface{}) bool {
node := value.(*MetaNode)
if node.isWritable() {
metaWorked = true
log.LogInfof("action[checkGrpState] nodeset[%v] zonename[%v] used [%v] total [%v] threshold [%v] got available metanode",
node.ID, node.ZoneName, node.Used, node.Total, node.Threshold)
return false
}
log.LogInfof("action[checkGrpState] nodeset[%v] zonename[%v] used [%v] total [%v] threshold [%v] got available metanode",
node.ID, node.ZoneName, node.Used, node.Total, node.Threshold)
return true
})
if !metaWorked || !dataWorked {
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], status[%v] be set metaWorked[%v] dataWorked[%v]",
i, domainGrpManager.nodeSetGrpMap[i].ID, domainGrpManager.nodeSetGrpMap[i].status, metaWorked, dataWorked)
if !metaWorked {
grpMetaUnAvailableCnt++
if grpMetaUnAvailableCnt == 2 { // meta is still usable while only one nodeset lacks a writable meta node
if grpStatus == dataNodesUnAvailable {
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], grp status change from dataNodesUnAvailable to unavailable",
i, domainGrpManager.nodeSetGrpMap[i].ID)
grpStatus = unavailableZone
break
}
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], grp status be set metaNodesUnAvailable",
i, domainGrpManager.nodeSetGrpMap[i].ID)
grpStatus = metaNodesUnAvailable
metaUnAvailableCnt++
}
}
if !dataWorked && grpStatus != dataNodesUnAvailable {
if grpStatus == metaNodesUnAvailable {
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], grp status change from metaNodesUnAvailable to unavailable",
i, domainGrpManager.nodeSetGrpMap[i].ID)
grpStatus = unavailableZone
break
}
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], grp status be set dataNodesUnAvailable",
i, domainGrpManager.nodeSetGrpMap[i].ID)
grpStatus = dataNodesUnAvailable
dataUnAvailableCnt++
}
}
}
domainGrpManager.nodeSetGrpMap[i].status = grpStatus
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], status[%v] be set normal",
i, domainGrpManager.nodeSetGrpMap[i].ID, domainGrpManager.nodeSetGrpMap[i].status)
}
domainGrpManager.status = normal
if dataUnAvailableCnt == len(domainGrpManager.nodeSetGrpMap) {
domainGrpManager.status = dataNodesUnAvailable
}
if metaUnAvailableCnt == len(domainGrpManager.nodeSetGrpMap) {
if domainGrpManager.status == dataNodesUnAvailable {
domainGrpManager.status = unavailableZone
} else {
domainGrpManager.status = metaNodesUnAvailable
}
}
log.LogInfof("action[checkGrpState] nodesetgrp size [%v] dataUnAvailableCnt [%v] metaUnAvailableCnt [%v] nsgm.status now[%v]",
len(domainGrpManager.nodeSetGrpMap), dataUnAvailableCnt, metaUnAvailableCnt, domainGrpManager.status)
}
type buildNodeSetGrpMethod func(nsgm *DomainManager, domainGrpManager *DomainNodeSetGrpManager) (err error)
func (nsgm *DomainManager) buildNodeSetGrp(domainGrpManager *DomainNodeSetGrpManager) (err error) {
log.LogInfof("action[buildNodeSetGrp] available zone [%v]", len(domainGrpManager.zoneAvailableNodeSet))
if len(domainGrpManager.zoneAvailableNodeSet) == 0 {
err = fmt.Errorf("action[buildNodeSetGrp] failed zone available zero")
log.LogErrorf("[%v]", err)
return
}
method := map[int]buildNodeSetGrpMethod{
3: buildNodeSetGrp3Zone,
2: buildNodeSetGrp2Plus1,
1: buildNodeSetGrpOneZone,
}
step := defaultNodeSetGrpStep
zoneCnt := nsgm.c.cfg.DefaultNormalZoneCnt
log.LogInfof("action[buildNodeSetGrp] zoncnt [%v]", zoneCnt)
if zoneCnt >= 3 {
zoneCnt = 3
}
if zoneCnt > len(domainGrpManager.zoneAvailableNodeSet) {
if nsgm.c.cfg.DomainBuildAsPossible || domainGrpManager.domainId > 0 {
log.LogInfof("action[buildNodeSetGrp] zoncnt [%v]", zoneCnt)
zoneCnt = len(domainGrpManager.zoneAvailableNodeSet)
} else {
err = fmt.Errorf("action[buildNodeSetGrp] failed zone available [%v] need [%v]", zoneCnt, len(domainGrpManager.zoneAvailableNodeSet))
log.LogErrorf("[%v]", err)
return
}
}
for {
log.LogInfof("action[buildNodeSetGrp] zoneCnt [%v] step [%v]", zoneCnt, step)
err = method[zoneCnt](nsgm, domainGrpManager)
if err != nil {
log.LogInfof("action[buildNodeSetGrp] err [%v]", err)
break
}
step--
if step == 0 {
break
}
}
if domainGrpManager.status != normal || len(domainGrpManager.nodeSetGrpMap) == 0 {
return fmt.Errorf("cann't build new group [%v]", err)
}
return nil
}
func (nsgm *DomainManager) getHostFromNodeSetGrpSpecific(domainGrpManager *DomainNodeSetGrpManager, replicaNum uint8, createType uint32) (
hosts []string,
peers []proto.Peer,
err error,
) {
log.LogErrorf("action[getHostFromNodeSetGrpSpecific] replicaNum[%v],type[%v], nsg cnt[%v], nsg status[%v]",
replicaNum, createType, len(domainGrpManager.nodeSetGrpMap), domainGrpManager.status)
if len(domainGrpManager.nodeSetGrpMap) == 0 {
log.LogErrorf("action[getHostFromNodeSetGrpSpecific] [%v] nodeSetGrpMap zero", domainGrpManager.domainId)
return nil, nil, fmt.Errorf("nodeSetGrpMap zero")
}
nsgm.RLock()
defer nsgm.RUnlock()
var cnt int
nsgIndex := domainGrpManager.nsgIndex
domainGrpManager.nsgIndex = (domainGrpManager.nsgIndex + 1) % len(domainGrpManager.nodeSetGrpMap)
for {
if cnt >= len(domainGrpManager.nodeSetGrpMap) {
log.LogInfof("action[getHostFromNodeSetGrpSpecific] failed all nsGrp unavailable,cnt[%v]", cnt)
err = fmt.Errorf("action[getHostFromNodeSetGrpSpecific],err:no nsGrp status normal,cnt[%v]", cnt)
break
}
cnt++
nsgIndex = (nsgIndex + 1) % len(domainGrpManager.nodeSetGrpMap)
nsg := domainGrpManager.nodeSetGrpMap[nsgIndex]
needReplicaNumArray := [3]int{1, 2, 3}
for _, needReplicaNum := range needReplicaNumArray {
var (
host []string
peer []proto.Peer
)
// every replica will look around every nodeset and break if get one
for i := 0; i < defaultFaultDomainZoneCnt; i++ {
ns := nsg.nodeSets[nsg.nsgInnerIndex]
nsg.nsgInnerIndex = (nsg.nsgInnerIndex + 1) % defaultFaultDomainZoneCnt
log.LogInfof("action[getHostFromNodeSetGrpSpecific] nodesetid[%v],zonename[%v], datanode len[%v],metanode len[%v],capacity[%v]",
ns.ID, ns.zoneName, ns.dataNodeLen(), ns.metaNodeLen(), ns.Capacity)
needNum := needReplicaNum
if needReplicaNum > int(replicaNum)-len(hosts) {
needNum = int(replicaNum) - len(hosts)
}
if createType == TypeDataPartition {
if host, peer, err = ns.getAvailDataNodeHosts(nil, needNum); err != nil {
log.LogErrorf("action[getHostFromNodeSetGrpSpecific] ns[%v] zone[%v] TypeDataPartition err[%v]", ns.ID, ns.zoneName, err)
// nsg.status = dataNodesUnAvailable
continue
}
} else {
if host, peer, err = ns.getAvailMetaNodeHosts(nil, needNum); err != nil {
log.LogErrorf("action[getHostFromNodeSetGrpSpecific] ns[%v] zone[%v] TypeMetaPartition err[%v]", ns.ID, ns.zoneName, err)
// nsg.status = metaNodesUnAvailable
continue
}
}
hosts = append(hosts, host...)
peers = append(peers, peer...)
if int(replicaNum) == len(hosts) {
log.LogInfof("action[getHostFromNodeSetGrpSpecific] ngGrp[%v] unable support type[%v] replicaNum[%v]", nsg.ID, createType, replicaNum)
return
}
}
hosts = nil
peers = nil
}
}
return nil, nil, fmt.Errorf("action[getHostFromNodeSetGrpSpecific] can't alloc host")
}
func (nsgm *DomainManager) getHostFromNodeSetGrp(domainId uint64, replicaNum uint8, createType uint32) (
hosts []string,
peers []proto.Peer,
err error) {
var ok bool
var index int
if index, ok = nsgm.domainId2IndexMap[domainId]; !ok {
err = fmt.Errorf("action[getHostFromNodeSetGrp] not found domainid[%v]", domainId)
return
}
domainGrpManager := nsgm.domainNodeSetGrpVec[index]
log.LogInfof("action[getHostFromNodeSetGrp] domainId [%v] index [%v] replicaNum[%v],type[%v], nsg cnt[%v], nsg status[%v]",
domainId, index, replicaNum, createType, len(domainGrpManager.nodeSetGrpMap), domainGrpManager.status)
// this scenario is abnormal may be caused by zone unavailable in high probability
if domainGrpManager.status != normal {
return nsgm.getHostFromNodeSetGrpSpecific(domainGrpManager, replicaNum, createType)
}
// the grp map is normally built with three zones and no grp exists when there are fewer than three zones;
// here a nodesetGrp is still built with fewer zones, because offering service matters more than high availability
if len(domainGrpManager.zoneAvailableNodeSet) != 0 {
if nsgm.buildNodeSetGrp(domainGrpManager); len(domainGrpManager.nodeSetGrpMap) == 0 {
err = fmt.Errorf("no usable group")
log.LogErrorf("action[getHostFromNodeSetGrp] no usable group build failed,err[%v]", err)
return
}
} else if len(domainGrpManager.nodeSetGrpMap) == 0 {
err = fmt.Errorf("no usable group")
log.LogInfof("action[getHostFromNodeSetGrp] err[%v]", err)
return
}
nsgm.RLock()
defer nsgm.RUnlock()
var cnt int
nsgIndex := domainGrpManager.nsgIndex
domainGrpManager.nsgIndex = (domainGrpManager.nsgIndex + 1) % len(domainGrpManager.nodeSetGrpMap)
for {
if cnt >= len(domainGrpManager.nodeSetGrpMap) {
err = fmt.Errorf("action[getHostFromNodeSetGrp] need replica cnt [%v] but get host cnt [%v] from nodesetgrps count[%v]",
replicaNum, len(hosts), cnt)
log.LogErrorf(err.Error())
return nil, nil, err
}
cnt++
nsgIndex = (nsgIndex + 1) % len(domainGrpManager.nodeSetGrpMap)
nsg := domainGrpManager.nodeSetGrpMap[nsgIndex]
var (
host []string
peer []proto.Peer
)
// it's better to get enough replicas from one nsg (copy set); the remainder is
// taken from other nsgs if that is not possible
for i := 0; i < defaultMaxReplicaCnt*len(nsg.nodeSets); i++ {
ns := nsg.nodeSets[nsg.nsgInnerIndex]
log.LogInfof("action[getHostFromNodeSetGrp] nodesetid[%v],zonename[%v], datanode len[%v],metanode len[%v],capacity[%v]",
ns.ID, ns.zoneName, ns.dataNodeLen(), ns.metaNodeLen(), ns.Capacity)
nsg.nsgInnerIndex = (nsg.nsgInnerIndex + 1) % defaultFaultDomainZoneCnt
if nsg.status == unavailableZone {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] unavailableZone", ns.ID, ns.zoneName)
continue
}
if createType == TypeDataPartition {
if nsg.status == dataNodesUnAvailable {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] dataNodesUnAvailable", ns.ID, ns.zoneName)
continue
}
if host, peer, err = ns.getAvailDataNodeHosts(hosts, 1); err != nil {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] TypeDataPartition err[%v]", ns.ID, ns.zoneName, err)
// nsg.status = dataNodesUnAvailable
continue
}
} else {
if nsg.status == metaNodesUnAvailable {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] metaNodesUnAvailable", ns.ID, ns.zoneName)
continue
}
if host, peer, err = ns.getAvailMetaNodeHosts(hosts, 1); err != nil {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] TypeMetaPartition err[%v]", ns.ID, ns.zoneName, err)
// nsg.status = metaNodesUnAvailable
continue
}
}
hosts = append(hosts, host[0])
peers = append(peers, peer[0])
log.LogInfof("action[getHostFromNodeSetGrp] get host[%v] peer[%v], nsg id[%v] nsgInnerIndex[%v]", host[0], peer[0], nsg.ID, nsg.nsgInnerIndex)
if len(hosts) == int(replicaNum) {
return hosts, peers, nil
}
}
}
}
// nodeset may not
type nsList struct {
lst *list.List
ele *list.Element
zoneName string
}
func (nsgm *DomainManager) buildNodeSetGrpPrepare(domainGrpManager *DomainNodeSetGrpManager) (buildIndex int, zoneAvaVec []nsList) {
sortedKeys := make([]string, 0)
for k := range domainGrpManager.zoneAvailableNodeSet {
sortedKeys = append(sortedKeys, k)
}
sort.Strings(sortedKeys)
for _, zoneName := range sortedKeys {
var zoneInfo nsList
zoneInfo.lst = domainGrpManager.zoneAvailableNodeSet[zoneName]
zoneInfo.zoneName = zoneName
zoneAvaVec = append(zoneAvaVec, zoneInfo)
}
buildIndex = domainGrpManager.lastBuildIndex % len(zoneAvaVec)
domainGrpManager.lastBuildIndex = (domainGrpManager.lastBuildIndex + 1) % len(zoneAvaVec)
return
}
func (nsgm *DomainManager) buildNodeSetGrpDoWork(zoneName string, nodeList *list.List, needCnt int) (resList []nsList, err error) {
log.LogInfof("action[buildNodeSetGrpDoWork] step in")
var tmpList []nsList
ele := nodeList.Front()
for {
if ele == nil {
log.LogInfof("action[buildNodeSetGrpDoWork] zone [%v] can't create nodeset group nodeList not qualified", zoneName)
err = fmt.Errorf("action[buildNodeSetGrpDoWork] zone [%v] can't create nodeset group nodeList not qualified", zoneName)
return
}
nst := ele.Value.(*nodeSet)
log.LogInfof("action[buildNodeSetGrpDoWork] nodeset [%v] zonename [%v] ,metacnt[%v],datacnt[%v]",
nst.ID, nst.zoneName, nst.metaNodeLen(), nst.dataNodeLen())
if nst.dataNodeLen() > 0 && nst.metaNodeLen() > 0 {
var nsl nsList
nsl.lst = nodeList
nsl.ele = ele
nsl.zoneName = zoneName
tmpList = append(tmpList, nsl)
log.LogInfof("action[buildNodeSetGrpDoWork] nodeset [%v] zonename [%v] qualified be put in,metacnt[%v],datacnt[%v]",
nst.ID, nst.zoneName, nst.metaNodeLen(), nst.dataNodeLen())
needCnt = needCnt - 1
if needCnt == 0 {
break
}
}
ele = ele.Next()
}
if needCnt == 0 {
resList = append(resList, tmpList...)
} else {
err = fmt.Errorf("not quliaifed")
}
return
}
func (nsgm *DomainManager) buildNodeSetGrpCommit(resList []nsList, domainGrpManager *DomainNodeSetGrpManager) {
nodeSetGrp := newNodeSetGrp(nsgm.c)
nodeSetGrp.domainId = domainGrpManager.domainId
for i := 0; i < len(resList); i++ {
nst := resList[i].ele.Value.(*nodeSet)
nodeSetGrp.nodeSets = append(nodeSetGrp.nodeSets, nst)
nodeSetGrp.nodeSetsIds = append(nodeSetGrp.nodeSetsIds, nst.ID)
log.LogInfof("action[buildNodeSetGrpCommit] build nodesetGrp id[%v] with append nst id [%v] zoneName [%v]", nodeSetGrp.ID, nst.ID, nst.zoneName)
resList[i].lst.Remove(resList[i].ele)
domainGrpManager.nsId2NsGrpMap[nst.ID] = len(domainGrpManager.nodeSetGrpMap)
if resList[i].lst.Len() == 0 {
delete(domainGrpManager.zoneAvailableNodeSet, resList[i].zoneName)
log.LogInfof("action[buildNodeSetGrpCommit] after grp build no nodeset available for zone[%v],nodesetid:[%v], zonelist size[%v]",
nst.zoneName, nst.ID, len(domainGrpManager.zoneAvailableNodeSet))
}
}
log.LogInfof("action[buildNodeSetGrpCommit] success build nodesetgrp zonelist size[%v], nodesetids[%v]",
len(domainGrpManager.zoneAvailableNodeSet), nodeSetGrp.nodeSetsIds)
domainGrpManager.nodeSetGrpMap = append(domainGrpManager.nodeSetGrpMap, nodeSetGrp)
nsgm.c.putNodeSetGrpInfo(opSyncNodeSetGrp, nodeSetGrp)
domainGrpManager.status = normal
}
// policy for building a nodeset group when the zone count is three or more
func buildNodeSetGrp3Zone(nsgm *DomainManager, domainGrpManager *DomainNodeSetGrpManager) (err error) {
nsgm.Lock()
defer nsgm.Unlock()
log.LogInfof("action[buildNodeSetGrp3Zone step in")
if len(domainGrpManager.zoneAvailableNodeSet) < defaultFaultDomainZoneCnt {
log.LogInfof("action[DomainManager::buildNodeSetGrp3Zone] size error,can't create group zone cnt[%v]",
len(domainGrpManager.zoneAvailableNodeSet))
return fmt.Errorf("defaultFaultDomainZoneCnt not satisfied")
}
var resList []nsList
buildIndex, zoneAvaVec := nsgm.buildNodeSetGrpPrepare(domainGrpManager)
cnt := 0
for {
if cnt > 0 {
buildIndex = (buildIndex + 1) % len(zoneAvaVec)
}
if cnt == len(zoneAvaVec) || len(resList) == defaultReplicaNum {
log.LogInfof("step out inner loop in buildNodeSetGrp3Zone cnt [%v], inner index [%v]", cnt, buildIndex)
break
}
cnt++
nodeList := zoneAvaVec[buildIndex].lst
zoneName := zoneAvaVec[buildIndex].zoneName
var tmpList []nsList
if tmpList, err = nsgm.buildNodeSetGrpDoWork(zoneName, nodeList, 1); err != nil {
continue
}
resList = append(resList, tmpList...)
}
if len(resList) < defaultReplicaNum {
log.LogInfof("action[DomainManager::buildNodeSetGrp3Zone] can't create nodeset group nodeset qualified count [%v]", len(resList))
return fmt.Errorf("defaultFaultDomainZoneCnt not satisfied")
}
nsgm.buildNodeSetGrpCommit(resList, domainGrpManager)
return nil
}
func buildNodeSetGrpOneZone(nsgm *DomainManager, domainGrpManager *DomainNodeSetGrpManager) (err error) {
nsgm.Lock()
defer nsgm.Unlock()
log.LogInfof("action[buildNodeSetGrpOneZone] step in")
if len(domainGrpManager.zoneAvailableNodeSet) != 1 {
log.LogErrorf("action[buildNodeSetGrpOneZone] available zone cnt[%v]", len(domainGrpManager.zoneAvailableNodeSet))
err = fmt.Errorf("available zone cnt[%v]", len(domainGrpManager.zoneAvailableNodeSet))
return
}
buildIndex, zoneAvaVec := nsgm.buildNodeSetGrpPrepare(domainGrpManager)
if zoneAvaVec[buildIndex].lst.Len() < defaultReplicaNum {
log.LogErrorf("action[buildNodeSetGrpOneZone] not enough nodeset in available list")
return fmt.Errorf("not enough nodeset in available list")
}
var resList []nsList
if resList, err = nsgm.buildNodeSetGrpDoWork(zoneAvaVec[buildIndex].zoneName,
zoneAvaVec[buildIndex].lst, defaultReplicaNum); err != nil {
return err
}
nsgm.buildNodeSetGrpCommit(resList, domainGrpManager)
return nil
}
// build a 2-plus-1 nodesetGrp from two or more zones
func buildNodeSetGrp2Plus1(nsgm *DomainManager, domainGrpManager *DomainNodeSetGrpManager) (err error) {
nsgm.Lock()
defer nsgm.Unlock()
log.LogInfof("step in buildNodeSetGrp2Plus1")
cnt := 0
var resList []nsList
_, zoneAvaVec := nsgm.buildNodeSetGrpPrepare(domainGrpManager)
var np1, np2 int
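// np1 is the zone expected to contribute one nodeset and np2 the zone expected to contribute two;
// the selection below prefers zones with more available nodesets.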
if zoneAvaVec[0].lst.Len() < zoneAvaVec[1].lst.Len() {
np1 = 0
np2 = 1
} else {
np1 = 1
np2 = 0
}
for i := 2; i < len(zoneAvaVec); i++ {
if zoneAvaVec[i].lst.Len() > zoneAvaVec[np1].lst.Len() {
if zoneAvaVec[i].lst.Len() > zoneAvaVec[np2].lst.Len() {
np2 = i
} else {
np1 = i
}
}
}
if zoneAvaVec[np1].lst.Len() < 1 || zoneAvaVec[np2].lst.Len() < 2 {
log.LogInfof("step out buildNodeSetGrp2Plus1 np1 [%v] np2 [%v] cnt [%v], inner index [%v]",
np1, np2, cnt, domainGrpManager.lastBuildIndex)
return fmt.Errorf("action[buildNodeSetGrp2Plus1] failed")
}
var tmpList []nsList
if tmpList, err = nsgm.buildNodeSetGrpDoWork(zoneAvaVec[np1].zoneName, zoneAvaVec[np1].lst, 1); err != nil {
return
}
resList = append(resList, tmpList...)
if tmpList, err = nsgm.buildNodeSetGrpDoWork(zoneAvaVec[np2].zoneName, zoneAvaVec[np2].lst, 2); err != nil {
return
}
resList = append(resList, tmpList...)
nsgm.buildNodeSetGrpCommit(resList, domainGrpManager)
return
}
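// Illustrative sketch only: the three builders above cover the common fault-domain layouts
// (three or more zones, a single zone, and the 2+1 case). The hypothetical dispatcher below
// is not part of the production flow; it merely shows how a caller might pick a builder by
// the available-zone count, while the real selection inside DomainManager may weigh more state.
func pickNodeSetGrpBuilder(zoneCnt int) func(*DomainManager, *DomainNodeSetGrpManager) error {
switch {
case zoneCnt >= defaultFaultDomainZoneCnt: // three or more zones: one nodeset from each of three zones
return buildNodeSetGrp3Zone
case zoneCnt == 2: // two zones: the 2+1 layout
return buildNodeSetGrp2Plus1
default: // single zone: all nodesets come from the same zone
return buildNodeSetGrpOneZone
}
}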
func (nsgm *DomainManager) putNodeSet(ns *nodeSet, load bool) (err error) {
nsgm.Lock()
defer nsgm.Unlock()
var (
ok bool
index int
nsGrp *DomainNodeSetGrpManager
domainId uint64
)
if _, ok = nsgm.excludeZoneListDomain[ns.zoneName]; ok {
log.LogInfof("action[DomainManager::putNodeSet] zone[%v],nodesetid:[%v], domain vec size[%v]",
ns.zoneName, ns.ID, len(nsgm.domainNodeSetGrpVec))
return
}
if domainId, ok = nsgm.ZoneName2DomainIdMap[ns.zoneName]; !ok {
domainId = 0 // no domainId was set before; therefore, put it into the default domain
nsgm.ZoneName2DomainIdMap[ns.zoneName] = 0
}
if index, ok = nsgm.domainId2IndexMap[domainId]; !ok {
if domainId > 0 && !load { // domainId 0 can be created through nodeset creation; others must be created by createDomain
err = fmt.Errorf("inconsistent domainid exist in name map but node exist in index map")
log.LogErrorf("action[putNodeSet] %v", err)
return
}
grpRegion := newDomainNodeSetGrpManager()
nsgm.domainNodeSetGrpVec = append(nsgm.domainNodeSetGrpVec, grpRegion)
nsgm.ZoneName2DomainIdMap[ns.zoneName] = 0 // domainId must be zero here
grpRegion.domainId = domainId
index = len(nsgm.domainNodeSetGrpVec) - 1
nsgm.domainId2IndexMap[domainId] = index
log.LogInfof("action[putNodeSet] build domainId[%v] zoneName [%v] index [%v]", domainId, ns.zoneName, index)
}
nsGrp = nsgm.domainNodeSetGrpVec[index]
if _, ok = nsGrp.nsIdMap[ns.ID]; ok {
log.LogInfof("action[DomainManager::putNodeSet] zone[%v],nodesetid:[%v] already be put before load[%v]",
ns.zoneName, ns.ID, load)
return
}
nsGrp.nsIdMap[ns.ID] = 0
log.LogInfof("action[DomainManager::putNodeSet] zone[%v],nodesetid:[%v], domain vec size[%v], load[%v]",
ns.zoneName, ns.ID, len(nsgm.domainNodeSetGrpVec), load)
// the nodeset has already been put into a group; this should only happen when load == true
// here the hosts in ns should still be nil and wait for node registration
if grpidx, ok := nsGrp.nsId2NsGrpMap[ns.ID]; ok {
nsGrp.nodeSetGrpMap[grpidx].nodeSets = append(nsGrp.nodeSetGrpMap[grpidx].nodeSets, ns)
log.LogInfof("action[DomainManager::putNodeSet] zone[%v],nodesetid:[%v] already be put before grp index[%v], grp id[%v] load[%v]",
ns.zoneName, ns.ID, grpidx, nsGrp.nodeSetGrpMap[grpidx].ID, load)
return
}
if _, ok := nsGrp.zoneAvailableNodeSet[ns.zoneName]; !ok {
nsGrp.zoneAvailableNodeSet[ns.zoneName] = list.New()
log.LogInfof("action[DomainManager::putNodeSet] init list for zone[%v],zonelist size[%v]", ns.zoneName, len(nsGrp.zoneAvailableNodeSet))
}
log.LogInfof("action[DomainManager::putNodeSet] domainid [%v] ns id[%v] be put in zone[%v]", nsGrp.domainId, ns.ID, ns.zoneName)
nsGrp.zoneAvailableNodeSet[ns.zoneName].PushBack(ns)
return
}
type nodeSet struct {
ID uint64
Capacity int
zoneName string
metaNodes *sync.Map
dataNodes *sync.Map
decommissionDataPartitionList *DecommissionDataPartitionList
decommissionParallelLimit int32
decommissionDiskParallelFactor float64
nodeSelectLock sync.Mutex
dataNodeSelectorLock sync.RWMutex
dataNodeSelector NodeSelector
metaNodeSelectorLock sync.RWMutex
metaNodeSelector NodeSelector
sync.RWMutex
manualDecommissionDiskList *DecommissionDiskList
autoDecommissionDiskList *DecommissionDiskList
doneDecommissionDiskListTraverse chan struct{}
startDecommissionDiskListTraverse chan struct{}
DecommissionDisks sync.Map
diskParallelFactorLk sync.Mutex
}
type nodeSetDecommissionParallelStatus struct {
ID uint64
CurTokenNum int32
MaxTokenNum int32
RunningDp []uint64
}
func newNodeSet(c *Cluster, id uint64, cap int, zoneName string) *nodeSet {
log.LogInfof("action[newNodeSet] id[%v]", id)
ns := &nodeSet{
ID: id,
Capacity: cap,
zoneName: zoneName,
metaNodes: new(sync.Map),
dataNodes: new(sync.Map),
decommissionDataPartitionList: NewDecommissionDataPartitionList(c),
manualDecommissionDiskList: NewDecommissionDiskList(),
autoDecommissionDiskList: NewDecommissionDiskList(),
doneDecommissionDiskListTraverse: make(chan struct{}, 1),
startDecommissionDiskListTraverse: make(chan struct{}, 1),
dataNodeSelector: NewNodeSelector(DefaultNodeSelectorName, DataNodeType),
metaNodeSelector: NewNodeSelector(DefaultNodeSelectorName, MetaNodeType),
}
go ns.traverseDecommissionDisk(c)
return ns
}
func (ns *nodeSet) GetDataNodeSelector() string {
ns.dataNodeSelectorLock.RLock()
defer ns.dataNodeSelectorLock.RUnlock()
return ns.dataNodeSelector.GetName()
}
func (ns *nodeSet) SetDataNodeSelector(name string) {
ns.dataNodeSelectorLock.Lock()
defer ns.dataNodeSelectorLock.Unlock()
ns.dataNodeSelector = NewNodeSelector(name, DataNodeType)
}
func (ns *nodeSet) GetMetaNodeSelector() string {
ns.metaNodeSelectorLock.RLock()
defer ns.metaNodeSelectorLock.RUnlock()
return ns.metaNodeSelector.GetName()
}
func (ns *nodeSet) SetMetaNodeSelector(name string) {
ns.metaNodeSelectorLock.Lock()
defer ns.metaNodeSelectorLock.Unlock()
ns.metaNodeSelector = NewNodeSelector(name, MetaNodeType)
}
func (ns *nodeSet) metaNodeLen() (count int) {
ns.RLock()
defer ns.RUnlock()
ns.metaNodes.Range(func(key, value interface{}) bool {
count++
return true
})
return
}
func (ns *nodeSet) startDecommissionSchedule() {
ns.decommissionDataPartitionList.startTraverse()
ns.startDecommissionDiskListTraverse <- struct{}{}
}
func (ns *nodeSet) dataNodeLen() (count int) {
ns.RLock()
defer ns.RUnlock()
ns.dataNodes.Range(func(key, value interface{}) bool {
count++
return true
})
return
}
func (ns *nodeSet) putMetaNode(metaNode *MetaNode) {
ns.metaNodes.Store(metaNode.Addr, metaNode)
}
func (ns *nodeSet) deleteMetaNode(metaNode *MetaNode) {
ns.metaNodes.Delete(metaNode.Addr)
}
func (ns *nodeSet) canWriteForDataNode(replicaNum int) bool {
var count int
ns.dataNodes.Range(func(key, value interface{}) bool {
node := value.(*DataNode)
if node.isWriteAble() && node.dpCntInLimit() {
count++
}
if count >= replicaNum {
return false
}
return true
})
log.LogInfof("canWriteForDataNode zone[%v], ns[%v],count[%v], replicaNum[%v]",
ns.zoneName, ns.ID, count, replicaNum)
return count >= replicaNum
}
func (ns *nodeSet) canWriteForMetaNode(replicaNum int) bool {
var count int
ns.metaNodes.Range(func(key, value interface{}) bool {
node := value.(*MetaNode)
if node.isWritable() {
count++
}
if count >= replicaNum {
return false
}
return true
})
log.LogInfof("canWriteForMetaNode zone[%v], ns[%v],count[%v] replicaNum[%v]",
ns.zoneName, ns.ID, count, replicaNum)
return count >= replicaNum
}
func (ns *nodeSet) putDataNode(dataNode *DataNode) {
ns.dataNodes.Store(dataNode.Addr, dataNode)
}
func (ns *nodeSet) deleteDataNode(dataNode *DataNode) {
ns.dataNodes.Delete(dataNode.Addr)
}
func (ns *nodeSet) AddToDecommissionDataPartitionList(dp *DataPartition, c *Cluster) {
ns.decommissionDataPartitionList.Put(ns.ID, dp, c)
}
func (ns *nodeSet) UpdateMaxParallel(maxParallel int32) {
ns.decommissionDataPartitionList.updateMaxParallel(maxParallel)
log.LogDebugf("action[UpdateMaxParallel]nodeSet[%v] decommission limit update to [%v]", ns.ID, maxParallel)
atomic.StoreInt32(&ns.decommissionParallelLimit, maxParallel)
}
func (ns *nodeSet) UpdateDecommissionDiskFactor(factor float64) {
log.LogDebugf("action[UpdateDecommissionFactor]nodeSet[%v] decommission disk factor update to [%v]", ns.ID, factor)
ns.diskParallelFactorLk.Lock()
defer ns.diskParallelFactorLk.Unlock()
ns.decommissionDiskParallelFactor = factor
}
func (ns *nodeSet) QueryDecommissionDiskLimit() int {
ns.diskParallelFactorLk.Lock()
defer ns.diskParallelFactorLk.Unlock()
log.LogDebugf("action[QueryDecommissionDiskLimit]nodeSet[%v] decommission disk limit to [%v]",
ns.ID, int(ns.decommissionDiskParallelFactor*float64(ns.dataNodeLen())))
return int(ns.decommissionDiskParallelFactor * float64(ns.dataNodeLen()))
}
func (ns *nodeSet) getDecommissionParallelStatus() (int32, int32, []uint64) {
return ns.decommissionDataPartitionList.getDecommissionParallelStatus()
}
func (ns *nodeSet) AcquireDecommissionToken(id uint64) bool {
return ns.decommissionDataPartitionList.acquireDecommissionToken(id)
}
func (ns *nodeSet) ReleaseDecommissionToken(id uint64) {
ns.decommissionDataPartitionList.releaseDecommissionToken(id)
}
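// Illustrative usage sketch (hypothetical helper, not called by the master): a data partition
// decommission worker is expected to take a token from its nodeset before starting and to
// return it when done, so that the per-nodeset parallel limit is honored.
func decommissionWithToken(ns *nodeSet, dpID uint64, work func() error) error {
if !ns.AcquireDecommissionToken(dpID) {
return fmt.Errorf("nodeset[%v] has no decommission token available for dp[%v]", ns.ID, dpID)
}
defer ns.ReleaseDecommissionToken(dpID)
return work()
}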
func (ns *nodeSet) AddDecommissionDisk(dd *DecommissionDisk) {
ns.DecommissionDisks.Store(dd.GenerateKey(), dd)
if dd.IsManualDecommissionDisk() {
ns.addManualDecommissionDisk(dd)
} else {
ns.addAutoDecommissionDisk(dd)
}
log.LogInfof("action[AddDecommissionDisk] add disk %v type %v to ns %v", dd.GenerateKey(), dd.Type, ns.ID)
}
func (ns *nodeSet) RemoveDecommissionDisk(dd *DecommissionDisk) {
ns.DecommissionDisks.Delete(dd.GenerateKey())
if dd.IsManualDecommissionDisk() {
ns.removeManualDecommissionDisk(dd)
} else {
ns.removeAutoDecommissionDisk(dd)
}
log.LogInfof("action[RemoveDecommissionDisk] remove disk %v type %v from ns %v", dd.GenerateKey(), dd.Type, ns.ID)
}
func (ns *nodeSet) addManualDecommissionDisk(dd *DecommissionDisk) {
ns.manualDecommissionDiskList.Put(ns.ID, dd)
}
func (ns *nodeSet) addAutoDecommissionDisk(dd *DecommissionDisk) {
ns.autoDecommissionDiskList.Put(ns.ID, dd)
}
func (ns *nodeSet) removeManualDecommissionDisk(dd *DecommissionDisk) {
ns.manualDecommissionDiskList.Remove(ns.ID, dd)
}
func (ns *nodeSet) removeAutoDecommissionDisk(dd *DecommissionDisk) {
ns.autoDecommissionDiskList.Remove(ns.ID, dd)
}
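// traverseDecommissionDisk runs on a fixed ticker (DecommissionInterval) on the raft leader.
// Each tick it refreshes the status of every tracked DecommissionDisk, drops finished, failed
// or paused disks from the tracking map, computes the per-nodeset disk limit as
// decommissionDiskParallelFactor * dataNodeLen (a factor of 0 means no limit), and then pops
// newly marked disks, manual list first and auto list only when auto decommission is enabled,
// up to the remaining capacity and hands them to Cluster.TryDecommissionDisk.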
func (ns *nodeSet) traverseDecommissionDisk(c *Cluster) {
t := time.NewTicker(DecommissionInterval)
// wait for all decommission disks to be loaded when metadata is reloaded
log.LogInfof("action[traverseDecommissionDisk]wait %v", ns.ID)
<-ns.startDecommissionDiskListTraverse
log.LogInfof("action[traverseDecommissionDisk] traverseDecommissionDisk start %v", ns.ID)
defer t.Stop()
for {
select {
case <-ns.doneDecommissionDiskListTraverse:
log.LogWarnf("traverse stopped")
return
case <-t.C:
if c.partition != nil && !c.partition.IsRaftLeader() {
log.LogWarnf("Leader changed, stop traverse!")
continue
}
runningCnt := 0
ns.DecommissionDisks.Range(func(key, value interface{}) bool {
disk := value.(*DecommissionDisk)
disk.updateDecommissionStatus(c, false)
status := disk.GetDecommissionStatus()
if status == DecommissionRunning {
runningCnt++
} else if status == DecommissionSuccess || status == DecommissionFail || status == DecommissionPause {
// remove from decommission disk list
log.LogWarnf("traverseDecommissionDisk remove disk %v status %v",
disk.GenerateKey(), disk.GetDecommissionStatus())
ns.RemoveDecommissionDisk(disk)
}
return true
})
ns.diskParallelFactorLk.Lock()
maxDiskDecommissionCnt := int(ns.decommissionDiskParallelFactor * float64(ns.dataNodeLen()))
ns.diskParallelFactorLk.Unlock()
if maxDiskDecommissionCnt == 0 && ns.dataNodeLen() != 0 {
manualCnt, manualDisks := ns.manualDecommissionDiskList.PopMarkDecommissionDisk(0)
log.LogDebugf("traverseDecommissionDisk traverse manualCnt %v",
manualCnt)
if manualCnt > 0 {
for _, disk := range manualDisks {
c.TryDecommissionDisk(disk)
}
}
if c.AutoDecommissionDiskIsEnabled() {
autoCnt, autoDisks := ns.autoDecommissionDiskList.PopMarkDecommissionDisk(0)
log.LogDebugf("traverseDecommissionDisk traverse autoCnt %v",
autoCnt)
if autoCnt > 0 {
for _, disk := range autoDisks {
c.TryDecommissionDisk(disk)
}
}
}
} else {
newDiskDecommissionCnt := maxDiskDecommissionCnt - runningCnt
log.LogDebugf("traverseDecommissionDisk traverse DiskDecommissionCnt %v",
newDiskDecommissionCnt)
if newDiskDecommissionCnt > 0 {
manualCnt, manualDisks := ns.manualDecommissionDiskList.PopMarkDecommissionDisk(newDiskDecommissionCnt)
log.LogDebugf("traverseDecommissionDisk traverse manualCnt %v",
manualCnt)
if manualCnt > 0 {
for _, disk := range manualDisks {
c.TryDecommissionDisk(disk)
}
}
if newDiskDecommissionCnt-manualCnt > 0 && c.AutoDecommissionDiskIsEnabled() {
autoCnt, autoDisks := ns.autoDecommissionDiskList.PopMarkDecommissionDisk(newDiskDecommissionCnt - manualCnt)
log.LogDebugf("traverseDecommissionDisk traverse autoCnt %v",
autoCnt)
if autoCnt > 0 {
for _, disk := range autoDisks {
c.TryDecommissionDisk(disk)
}
}
}
}
}
}
}
}
func (t *topology) isSingleZone() bool {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
var zoneLen int
t.zoneMap.Range(func(zoneName, value interface{}) bool {
zoneLen++
return true
})
return zoneLen == 1
}
func (t *topology) getDomainExcludeZones() (zones []*Zone) {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
zones = make([]*Zone, 0)
for i := 0; i < len(t.domainExcludeZones); i++ {
if value, ok := t.zoneMap.Load(t.domainExcludeZones[i]); ok {
zones = append(zones, value.(*Zone))
log.LogInfof("action[getDomainExcludeZones] append zone name:[%v]_[%v]", t.domainExcludeZones[i], value.(*Zone).name)
}
}
return
}
func (t *topology) getAllZones() (zones []*Zone) {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
zones = make([]*Zone, 0)
t.zoneMap.Range(func(zoneName, value interface{}) bool {
zone := value.(*Zone)
zones = append(zones, zone)
return true
})
return
}
func (t *topology) getZoneByIndex(index int) (zone *Zone) {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
return t.zones[index]
}
func (t *topology) getNodeSetByNodeSetId(nodeSetId uint64) (nodeSet *nodeSet, err error) {
zones := t.getAllZones()
for _, zone := range zones {
nodeSet, err = zone.getNodeSet(nodeSetId)
if err == nil {
return nodeSet, nil
}
}
return nil, errors.NewErrorf("set %v not found", nodeSetId)
}
func calculateDemandWriteNodes(zoneNum int, replicaNum int) (demandWriteNodes int) {
if zoneNum == 1 {
demandWriteNodes = replicaNum
} else {
if replicaNum == 1 {
demandWriteNodes = 1
} else {
demandWriteNodes = 2
}
}
return
}
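// Example (illustrative): with replicaNum=3 a single-zone request needs 3 writable nodes in
// that zone, while a cross-zone request needs only 2 writable nodes per candidate zone because
// the replicas are spread across zones; replicaNum=1 always needs just 1 writable node.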
func (t *topology) allocZonesForMetaNode(zoneNum, replicaNum int, excludeZone []string) (zones []*Zone, err error) {
if len(t.domainExcludeZones) > 0 {
zones = t.getDomainExcludeZones()
log.LogInfof("action[allocZonesForMetaNode] getDomainExcludeZones zones [%v]", t.domainExcludeZones)
} else {
// if the domain feature is enabled, execution will not reach here
zones = t.getAllZones()
}
if t.isSingleZone() {
return zones, nil
}
if excludeZone == nil {
excludeZone = make([]string, 0)
}
candidateZones := make([]*Zone, 0)
demandWriteNodes := calculateDemandWriteNodes(zoneNum, replicaNum)
for i := 0; i < len(zones); i++ {
if t.zoneIndexForMetaNode >= len(zones) {
t.zoneIndexForMetaNode = 0
}
zone := zones[t.zoneIndexForMetaNode]
t.zoneIndexForMetaNode++
if zone.status == unavailableZone {
continue
}
if contains(excludeZone, zone.name) {
continue
}
if zone.canWriteForMetaNode(uint8(demandWriteNodes)) {
candidateZones = append(candidateZones, zone)
}
if len(candidateZones) >= zoneNum {
break
}
}
// when allocating across zones, candidateZones must contain at least 2 zones; otherwise there must be at least one candidate zone
if (zoneNum >= 2 && len(candidateZones) < 2) || len(candidateZones) < 1 {
log.LogError(fmt.Sprintf("action[allocZonesForMetaNode],reqZoneNum[%v],candidateZones[%v],demandWriteNodes[%v],err:%v",
zoneNum, len(candidateZones), demandWriteNodes, proto.ErrNoZoneToCreateMetaPartition))
return nil, proto.ErrNoZoneToCreateMetaPartition
}
zones = candidateZones
err = nil
return
}
func (t *topology) allocZonesForDataNode(zoneNum, replicaNum int, excludeZone []string) (zones []*Zone, err error) {
// domain is enabled and there are old zones to be used
if len(t.domainExcludeZones) > 0 {
zones = t.getDomainExcludeZones()
} else {
// if the domain feature is enabled, execution will not reach here
zones = t.getAllZones()
}
log.LogInfof("len(zones) = %v \n", len(zones))
if t.isSingleZone() {
return zones, nil
}
if excludeZone == nil {
excludeZone = make([]string, 0)
}
demandWriteNodes := calculateDemandWriteNodes(zoneNum, replicaNum)
candidateZones := make([]*Zone, 0)
for i := 0; i < len(zones); i++ {
if t.zoneIndexForDataNode >= len(zones) {
t.zoneIndexForDataNode = 0
}
zone := zones[t.zoneIndexForDataNode]
t.zoneIndexForDataNode++
if zone.status == unavailableZone {
continue
}
if contains(excludeZone, zone.name) {
continue
}
if zone.canWriteForDataNode(uint8(demandWriteNodes)) {
candidateZones = append(candidateZones, zone)
}
if len(candidateZones) >= zoneNum {
break
}
}
// when allocating across zones, candidateZones must contain at least 2 zones; otherwise there must be at least one candidate zone
if (zoneNum >= 2 && len(candidateZones) < 2) || len(candidateZones) < 1 {
log.LogError(fmt.Sprintf("action[allocZonesForDataNode],reqZoneNum[%v],candidateZones[%v],demandWriteNodes[%v],err:%v",
zoneNum, len(candidateZones), demandWriteNodes, proto.ErrNoZoneToCreateDataPartition))
return nil, errors.NewError(proto.ErrNoZoneToCreateDataPartition)
}
zones = candidateZones
err = nil
return
}
func (ns *nodeSet) dataNodeCount() int {
var count int
ns.dataNodes.Range(func(key, value interface{}) bool {
count++
return true
})
return count
}
// Zone stores all the zone related information
type Zone struct {
name string
dataNodesetSelectorLock sync.RWMutex
dataNodesetSelector NodesetSelector
metaNodesetSelectorLock sync.RWMutex
metaNodesetSelector NodesetSelector
status int
dataNodes *sync.Map
metaNodes *sync.Map
nodeSetMap map[uint64]*nodeSet
nsLock sync.RWMutex
QosIopsRLimit uint64
QosIopsWLimit uint64
QosFlowRLimit uint64
QosFlowWLimit uint64
sync.RWMutex
}
type zoneValue struct {
Name string
QosIopsRLimit uint64
QosIopsWLimit uint64
QosFlowRLimit uint64
QosFlowWLimit uint64
DataNodesetSelector string
MetaNodesetSelector string
}
func newZone(name string) (zone *Zone) {
zone = &Zone{name: name}
zone.status = normalZone
zone.dataNodes = new(sync.Map)
zone.metaNodes = new(sync.Map)
zone.nodeSetMap = make(map[uint64]*nodeSet)
zone.dataNodesetSelector = NewNodesetSelector(DefaultNodesetSelectorName, DataNodeType)
zone.metaNodesetSelector = NewNodesetSelector(DefaultNodesetSelectorName, MetaNodeType)
return
}
func printZonesName(zones []*Zone) string {
str := "["
for i, zone := range zones {
if i > 0 {
str += ","
}
str += zone.name
}
return str + "]"
}
func (zone *Zone) GetDataNodesetSelector() string {
zone.dataNodesetSelectorLock.RLock()
defer zone.dataNodesetSelectorLock.RUnlock()
return zone.dataNodesetSelector.GetName()
}
func (zone *Zone) SetDataNodesetSelector(name string) {
zone.dataNodesetSelectorLock.Lock()
defer zone.dataNodesetSelectorLock.Unlock()
zone.dataNodesetSelector = NewNodesetSelector(name, DataNodeType)
}
func (zone *Zone) GetMetaNodesetSelector() string {
zone.metaNodesetSelectorLock.RLock()
defer zone.metaNodesetSelectorLock.RUnlock()
return zone.metaNodesetSelector.GetName()
}
func (zone *Zone) SetMetaNodeSelector(name string) {
zone.metaNodesetSelectorLock.Lock()
defer zone.metaNodesetSelectorLock.Unlock()
zone.metaNodesetSelector = NewNodesetSelector(name, MetaNodeType)
}
func (zone *Zone) getFsmValue() *zoneValue {
return &zoneValue{
Name: zone.name,
QosIopsRLimit: zone.QosIopsRLimit,
QosIopsWLimit: zone.QosIopsWLimit,
QosFlowRLimit: zone.QosFlowRLimit,
QosFlowWLimit: zone.QosFlowWLimit,
DataNodesetSelector: zone.GetDataNodesetSelector(),
MetaNodesetSelector: zone.GetMetaNodesetSelector(),
}
}
func (zone *Zone) setStatus(status int) {
zone.status = status
}
func (zone *Zone) getStatus() int {
return zone.status
}
func (zone *Zone) getStatusToString() string {
if zone.status == normalZone {
return "available"
} else {
return "unavailable"
}
}
func (zone *Zone) isSingleNodeSet() bool {
zone.RLock()
defer zone.RUnlock()
return len(zone.nodeSetMap) == 1
}
func (zone *Zone) getNodeSet(setID uint64) (ns *nodeSet, err error) {
zone.nsLock.RLock()
defer zone.nsLock.RUnlock()
ns, ok := zone.nodeSetMap[setID]
if !ok {
return nil, errors.NewErrorf("set %v not found", setID)
}
return
}
func (zone *Zone) putNodeSet(ns *nodeSet) (err error) {
zone.nsLock.Lock()
defer zone.nsLock.Unlock()
if _, ok := zone.nodeSetMap[ns.ID]; ok {
return fmt.Errorf("nodeSet [%v] has exist", ns.ID)
}
zone.nodeSetMap[ns.ID] = ns
return
}
func (zone *Zone) createNodeSet(c *Cluster) (ns *nodeSet, err error) {
cnt := 1
allNodeSet := zone.getAllNodeSet()
log.LogInfof("action[createNodeSet] zone[%v] FaultDomain:[%v] init[%v] DefaultNormalZoneCnt[%v] nodeset cnt[%v]",
zone.name, c.FaultDomain, c.domainManager.init, c.cfg.DefaultNormalZoneCnt, len(allNodeSet))
if c.FaultDomain && c.domainManager.init && c.cfg.DefaultNormalZoneCnt < defaultReplicaNum {
if _, ok := c.domainManager.excludeZoneListDomain[zone.name]; !ok {
dstNsCnt := 0
if c.cfg.DefaultNormalZoneCnt == 1 { // a single-zone domain needs 3 nodesets at the beginning
dstNsCnt = 3
} else {
dstNsCnt = 2 // a two-zone domain needs 2 nodesets in each zone
}
if len(allNodeSet) < dstNsCnt {
log.LogInfof("action[createNodeSet] zone[%v] nodeset len:[%v] less then 3,create to 3 one time",
zone.name, len(allNodeSet))
cnt = dstNsCnt - len(allNodeSet)
}
} else {
log.LogInfof("action[createNodeSet] zone[%v] get in excludeZoneListDomain", zone.name)
}
}
for {
if cnt == 0 {
break
}
cnt--
id, err := c.idAlloc.allocateCommonID()
if err != nil {
return nil, err
}
ns = newNodeSet(c, id, c.cfg.nodeSetCapacity, zone.name)
ns.UpdateMaxParallel(int32(c.DecommissionLimit))
ns.UpdateDecommissionDiskFactor(c.DecommissionDiskFactor)
ns.startDecommissionSchedule()
log.LogInfof("action[createNodeSet] syncAddNodeSet[%v] zonename[%v]", ns.ID, zone.name)
if err = c.syncAddNodeSet(ns); err != nil {
return nil, err
}
if err = zone.putNodeSet(ns); err != nil {
return nil, err
}
log.LogInfof("action[createNodeSet] nodeSet[%v]", ns.ID)
}
return
}
func (zone *Zone) getAllNodeSet() (nsc nodeSetCollection) {
zone.nsLock.RLock()
defer zone.nsLock.RUnlock()
nsc = make(nodeSetCollection, 0)
for _, ns := range zone.nodeSetMap {
nsc = append(nsc, ns)
}
return
}
func (zone *Zone) getAvailNodeSetForMetaNode() (nset *nodeSet) {
allNodeSet := zone.getAllNodeSet()
sort.Sort(sort.Reverse(allNodeSet))
for _, ns := range allNodeSet {
if ns.metaNodeLen() < ns.Capacity {
if nset == nil {
nset = ns
} else {
if nset.Capacity-nset.metaNodeLen() < ns.Capacity-ns.metaNodeLen() {
nset = ns
}
}
continue
}
}
return
}
func (zone *Zone) getAvailNodeSetForDataNode() (nset *nodeSet) {
allNodeSet := zone.getAllNodeSet()
for _, ns := range allNodeSet {
if ns.dataNodeLen() < ns.Capacity {
if nset == nil {
nset = ns
} else {
if nset.Capacity-nset.dataNodeLen() < ns.Capacity-ns.dataNodeLen() {
nset = ns
}
}
continue
}
}
return
}
func (zone *Zone) putDataNode(dataNode *DataNode) (err error) {
var ns *nodeSet
if ns, err = zone.getNodeSet(dataNode.NodeSetID); err != nil {
log.LogErrorf("action[putDataNode] nodeSet[%v] not found", dataNode.NodeSetID)
return
}
ns.putDataNode(dataNode)
zone.dataNodes.Store(dataNode.Addr, dataNode)
return
}
func (zone *Zone) getDataNode(addr string) (dataNode *DataNode, err error) {
value, ok := zone.dataNodes.Load(addr)
if !ok {
return nil, errors.Trace(dataNodeNotFound(addr), "%v not found", addr)
}
dataNode = value.(*DataNode)
return
}
func (zone *Zone) deleteDataNode(dataNode *DataNode) {
ns, err := zone.getNodeSet(dataNode.NodeSetID)
if err != nil {
log.LogErrorf("action[zoneDeleteDataNode] nodeSet[%v] not found", dataNode.NodeSetID)
return
}
ns.deleteDataNode(dataNode)
zone.dataNodes.Delete(dataNode.Addr)
}
func (zone *Zone) putMetaNode(metaNode *MetaNode) (err error) {
var ns *nodeSet
if ns, err = zone.getNodeSet(metaNode.NodeSetID); err != nil {
log.LogErrorf("action[zonePutMetaNode] nodeSet[%v] not found", metaNode.NodeSetID)
return
}
ns.putMetaNode(metaNode)
zone.metaNodes.Store(metaNode.Addr, metaNode)
return
}
func (zone *Zone) deleteMetaNode(metaNode *MetaNode) (err error) {
ns, err := zone.getNodeSet(metaNode.NodeSetID)
if err != nil {
log.LogErrorf("action[zoneDeleteMetaNode] nodeSet[%v] not found", metaNode.NodeSetID)
return
}
ns.deleteMetaNode(metaNode)
zone.metaNodes.Delete(metaNode.Addr)
return
}
func (zone *Zone) allocNodeSetForDataNode(excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
nset := zone.getAllNodeSet()
if len(nset) == 0 {
return nil, errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
}
zone.nsLock.Lock()
defer zone.nsLock.Unlock()
// we need a read lock to block modification of the nodeset selector
zone.dataNodesetSelectorLock.RLock()
defer zone.dataNodesetSelectorLock.RUnlock()
ns, err = zone.dataNodesetSelector.Select(nset, excludeNodeSets, replicaNum)
if err != nil {
log.LogErrorf("action[allocNodeSetForDataNode],nset len[%v],excludeNodeSets[%v],rNum[%v] err:%v",
nset.Len(), excludeNodeSets, replicaNum, proto.ErrNoNodeSetToCreateDataPartition)
return nil, errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
}
return ns, nil
}
func (zone *Zone) allocNodeSetForMetaNode(excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
nset := zone.getAllNodeSet()
if len(nset) == 0 {
return nil, proto.ErrNoNodeSetToCreateMetaPartition
}
zone.nsLock.Lock()
defer zone.nsLock.Unlock()
// we need a read lock to block modification of the nodeset selector
zone.metaNodesetSelectorLock.RLock()
defer zone.metaNodesetSelectorLock.RUnlock()
ns, err = zone.metaNodesetSelector.Select(nset, excludeNodeSets, replicaNum)
if err != nil {
log.LogError(fmt.Sprintf("action[allocNodeSetForMetaNode],zone[%v],excludeNodeSets[%v],rNum[%v],err:%v",
zone.name, excludeNodeSets, replicaNum, proto.ErrNoNodeSetToCreateMetaPartition))
return nil, proto.ErrNoNodeSetToCreateMetaPartition
}
return ns, nil
}
func (zone *Zone) canWriteForDataNode(replicaNum uint8) (can bool) {
zone.RLock()
defer zone.RUnlock()
var leastAlive uint8
zone.dataNodes.Range(func(addr, value interface{}) bool {
dataNode := value.(*DataNode)
if !dataNode.dpCntInLimit() {
return true
}
if dataNode.isActive && dataNode.isWriteAbleWithSize(30*util.GB) {
leastAlive++
}
if leastAlive >= replicaNum {
can = true
return false
}
return true
})
log.LogInfof("canWriteForDataNode leastAlive[%v],replicaNum[%v],count[%v]\n", leastAlive, replicaNum, zone.dataNodeCount())
return
}
func (zone *Zone) isUsedRatio(ratio float64) (can bool) {
zone.RLock()
defer zone.RUnlock()
var (
dataNodeUsed uint64
dataNodeTotal uint64
metaNodeUsed uint64
metaNodeTotal uint64
)
zone.dataNodes.Range(func(addr, value interface{}) bool {
dataNode := value.(*DataNode)
if dataNode.isActive {
dataNodeUsed += dataNode.Used
} else {
dataNodeUsed += dataNode.Total
}
dataNodeTotal += dataNode.Total
return true
})
if float64(dataNodeUsed)/float64(dataNodeTotal) > ratio {
log.LogInfof("action[isUsedRatio] zone[%v] dataNodeUsed [%v] total [%v], ratio[%v]", zone.name, dataNodeUsed, dataNodeTotal, ratio)
return true
}
zone.metaNodes.Range(func(addr, value interface{}) bool {
metaNode := value.(*MetaNode)
if metaNode.IsActive && metaNode.isWritable() {
metaNodeUsed += metaNode.Used
} else {
metaNodeUsed += metaNode.Total
}
metaNodeTotal += metaNode.Total
return true
})
if float64(metaNodeUsed)/float64(metaNodeTotal) > ratio {
log.LogInfof("action[isUsedRatio] zone[%v] metaNodeUsed [%v] total [%v], ratio[%v]", zone.name, metaNodeUsed, metaNodeTotal, ratio)
return true
}
return false
}
func (zone *Zone) getDataUsed() (dataNodeUsed uint64, dataNodeTotal uint64) {
zone.RLock()
defer zone.RUnlock()
zone.dataNodes.Range(func(addr, value interface{}) bool {
dataNode := value.(*DataNode)
if dataNode.isActive {
dataNodeUsed += dataNode.Used
} else {
dataNodeUsed += dataNode.Total
}
dataNodeTotal += dataNode.Total
return true
})
return dataNodeUsed, dataNodeTotal
}
func (zone *Zone) getMetaUsed() (metaNodeUsed uint64, metaNodeTotal uint64) {
zone.RLock()
defer zone.RUnlock()
zone.metaNodes.Range(func(addr, value interface{}) bool {
metaNode := value.(*MetaNode)
if metaNode.IsActive && metaNode.isWritable() {
metaNodeUsed += metaNode.Used
} else {
metaNodeUsed += metaNode.Total
}
metaNodeTotal += metaNode.Total
return true
})
return metaNodeUsed, metaNodeTotal
}
func (zone *Zone) getSpaceLeft(dataType uint32) (spaceLeft uint64) {
if dataType == TypeDataPartition {
dataNodeUsed, dataNodeTotal := zone.getDataUsed()
return dataNodeTotal - dataNodeUsed
} else {
metaNodeUsed, metaNodeTotal := zone.getMetaUsed()
return metaNodeTotal - metaNodeUsed
}
}
func (zone *Zone) canWriteForMetaNode(replicaNum uint8) (can bool) {
zone.RLock()
defer zone.RUnlock()
var leastAlive uint8
zone.metaNodes.Range(func(addr, value interface{}) bool {
metaNode := value.(*MetaNode)
if metaNode.IsActive && metaNode.isWritable() {
leastAlive++
}
if leastAlive >= replicaNum {
can = true
return false
}
return true
})
return
}
func (zone *Zone) getDataNodeMaxTotal() (maxTotal uint64) {
zone.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if dataNode.Total > maxTotal {
maxTotal = dataNode.Total
}
return true
})
return
}
func (zone *Zone) getAvailNodeHosts(nodeType uint32, excludeNodeSets []uint64, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
if replicaNum == 0 {
return
}
log.LogDebugf("[x] get node host, zone(%s), nodeType(%d)", zone.name, nodeType)
if nodeType == TypeDataPartition {
ns, err := zone.allocNodeSetForDataNode(excludeNodeSets, uint8(replicaNum))
if err != nil {
return nil, nil, errors.Trace(err, "zone[%v] alloc node set,replicaNum[%v]", zone.name, replicaNum)
}
return ns.getAvailDataNodeHosts(excludeHosts, replicaNum)
}
ns, err := zone.allocNodeSetForMetaNode(excludeNodeSets, uint8(replicaNum))
if err != nil {
return nil, nil, errors.NewErrorf("zone[%v],err[%v]", zone.name, err)
}
return ns.getAvailMetaNodeHosts(excludeHosts, replicaNum)
}
func (zone *Zone) updateNodesetSelector(cluster *Cluster, dataNodesetSelector string, metaNodesetSelector string) error {
needSync := false
if dataNodesetSelector != "" && dataNodesetSelector != zone.GetDataNodesetSelector() {
needSync = true
zone.SetDataNodesetSelector(dataNodesetSelector)
}
if metaNodesetSelector != "" && metaNodesetSelector != zone.GetMetaNodesetSelector() {
needSync = true
zone.SetMetaNodeSelector(metaNodesetSelector)
}
if !needSync {
return nil
}
return cluster.sycnPutZoneInfo(zone)
}
func (zone *Zone) updateDataNodeQosLimit(cluster *Cluster, qosParam *qosArgs) error {
var err error
if qosParam.flowRVal > 0 {
zone.QosFlowRLimit = qosParam.flowRVal
}
if qosParam.flowWVal > 0 {
zone.QosFlowWLimit = qosParam.flowWVal
}
if qosParam.iopsRVal > 0 {
zone.QosIopsRLimit = qosParam.iopsRVal
}
if qosParam.iopsWVal > 0 {
zone.QosIopsWLimit = qosParam.iopsWVal
}
if err = cluster.sycnPutZoneInfo(zone); err != nil {
return err
}
zone.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if qosParam.flowRVal > 0 {
dataNode.QosFlowRLimit = qosParam.flowRVal
}
if qosParam.flowWVal > 0 {
dataNode.QosFlowWLimit = qosParam.flowWVal
}
if qosParam.iopsRVal > 0 {
dataNode.QosIopsRLimit = qosParam.iopsRVal
}
if qosParam.iopsWVal > 0 {
dataNode.QosIopsWLimit = qosParam.iopsWVal
}
return true
})
return nil
}
func (zone *Zone) loadDataNodeQosLimit() {
zone.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if zone.QosFlowRLimit > 0 {
dataNode.QosFlowRLimit = zone.QosFlowRLimit
}
if zone.QosFlowWLimit > 0 {
dataNode.QosFlowWLimit = zone.QosFlowWLimit
}
if zone.QosIopsRLimit > 0 {
dataNode.QosIopsRLimit = zone.QosIopsRLimit
}
if zone.QosIopsWLimit > 0 {
dataNode.QosIopsWLimit = zone.QosIopsWLimit
}
return true
})
}
func (zone *Zone) dataNodeCount() (len int) {
zone.dataNodes.Range(func(key, value interface{}) bool {
len++
return true
})
return
}
func (zone *Zone) updateDecommissionLimit(limit int32, c *Cluster) (err error) {
nodeSets := zone.getAllNodeSet()
if len(nodeSets) == 0 {
log.LogWarnf("Nodeset from %v is empty", zone.name)
return proto.ErrNoNodeSetToUpdateDecommissionLimit
}
for _, ns := range nodeSets {
ns.UpdateMaxParallel(limit)
if err = c.syncUpdateNodeSet(ns); err != nil {
log.LogWarnf("UpdateMaxParallel nodeset [%v] failed,err:%v", ns.ID, err.Error())
continue
}
}
log.LogInfof("All nodeset from %v set decommission limit to %v", zone.name, limit)
return
}
func (zone *Zone) updateDecommissionDiskFactor(factor float64, c *Cluster) (err error) {
nodeSets := zone.getAllNodeSet()
if len(nodeSets) == 0 {
log.LogWarnf("Nodeset from %v is empty", zone.name)
return proto.ErrNoNodeSetToUpdateDecommissionDiskFactor
}
for _, ns := range nodeSets {
ns.UpdateDecommissionDiskFactor(factor)
if err = c.syncUpdateNodeSet(ns); err != nil {
log.LogWarnf("updateDecommissionDiskFactor nodeset [%v] failed,err:%v", ns.ID, err.Error())
continue
}
}
log.LogInfof("All nodeset from %v set decommission disk factor to %v", zone.name, factor)
return
}
func (zone *Zone) queryDecommissionDiskLimit() (err error, diskLimit []proto.DecommissionDiskLimitDetail) {
nodeSets := zone.getAllNodeSet()
diskLimit = make([]proto.DecommissionDiskLimitDetail, 0)
if len(nodeSets) == 0 {
log.LogWarnf("Nodeset from %v is empty", zone.name)
return proto.ErrNoNodeSetToQueryDecommissionDiskLimit, nil
}
for _, ns := range nodeSets {
limit := ns.QueryDecommissionDiskLimit()
diskLimit = append(diskLimit, proto.DecommissionDiskLimitDetail{NodeSetId: ns.ID, Limit: limit})
}
log.LogInfof("All nodeset from %v set decommission disk limit %v", zone.name, diskLimit)
return
}
func (zone *Zone) queryDecommissionParallelStatus() (err error, stats []nodeSetDecommissionParallelStatus) {
nodeSets := zone.getAllNodeSet()
if len(nodeSets) == 0 {
log.LogWarnf("Nodeset from %v is empty", zone.name)
return proto.ErrNoNodeSetToQueryDecommissionLimitStatus, stats
}
for _, ns := range nodeSets {
curToken, maxToken, dps := ns.getDecommissionParallelStatus()
stat := nodeSetDecommissionParallelStatus{
ID: ns.ID,
CurTokenNum: curToken,
MaxTokenNum: maxToken,
RunningDp: dps,
}
stats = append(stats, stat)
}
log.LogInfof("All nodeset from %v decommission limit status %v", zone.name, stats)
return
}
func (zone *Zone) startDecommissionListTraverse(c *Cluster) (err error) {
nodeSets := zone.getAllNodeSet()
log.LogDebugf("startDecommissionListTraverse nodeSets len %v ", len(nodeSets))
if len(nodeSets) == 0 {
log.LogWarnf("action[startDecommissionListTraverse] Nodeset form %v is nil", zone.name)
return nil
}
for _, ns := range nodeSets {
log.LogInfof("action[startDecommissionListTraverse] ns[%v] from zone %v", ns.ID, zone.name)
ns.startDecommissionSchedule()
}
log.LogInfof("action[startDecommissionListTraverse] All nodeset from %v start decommission schedule", zone.name)
return
}
type DecommissionDataPartitionList struct {
mu sync.Mutex
cacheMap map[uint64]*list.Element
decommissionList *list.List
done chan struct{}
parallelLimit int32
curParallel int32
start chan struct{}
runningMap map[uint64]struct{}
}
type DecommissionDataPartitionListValue struct {
DecommissionDataPartitionCacheValue
ParallelLimit int32
CurParallel int32
}
type DecommissionDataPartitionCacheValue struct {
CacheMap []dataPartitionValue
Status uint32
}
const DecommissionInterval = 5 * time.Second
func NewDecommissionDataPartitionList(c *Cluster) *DecommissionDataPartitionList {
l := new(DecommissionDataPartitionList)
l.mu = sync.Mutex{}
l.cacheMap = make(map[uint64]*list.Element)
l.done = make(chan struct{}, 1)
l.start = make(chan struct{}, 1)
l.decommissionList = list.New()
l.runningMap = make(map[uint64]struct{})
atomic.StoreInt32(&l.curParallel, 0)
atomic.StoreInt32(&l.parallelLimit, defaultDecommissionParallelLimit)
go l.traverse(c)
return l
}
// reserved
func (l *DecommissionDataPartitionList) Stop() {
l.done <- struct{}{}
}
func (l *DecommissionDataPartitionList) Length() int {
l.mu.Lock()
defer l.mu.Unlock()
return l.decommissionList.Len()
}
func (l *DecommissionDataPartitionList) Put(id uint64, value *DataPartition, c *Cluster) {
if value == nil {
log.LogWarnf("action[DecommissionDataPartitionListPut] ns[%v] cannot put nil value", id)
return
}
// only dps in running, mark, or prepare status can be added
if !value.canAddToDecommissionList() {
log.LogWarnf("action[DecommissionDataPartitionListPut] ns[%v] put wrong dp[%v] status[%v]",
id, value.PartitionID, value.GetDecommissionStatus())
return
}
// reset prepare status back to mark status so the decommission can be retried
if value.GetDecommissionStatus() == DecommissionPrepare {
value.SetDecommissionStatus(markDecommission)
}
l.mu.Lock()
if _, ok := l.cacheMap[value.PartitionID]; ok {
l.mu.Unlock()
return
}
elm := l.decommissionList.PushBack(value)
l.cacheMap[value.PartitionID] = elm
l.mu.Unlock()
// restore from rocksdb
if value.checkConsumeToken() {
value.TryAcquireDecommissionToken(c)
}
log.LogInfof("action[DecommissionDataPartitionListPut] ns[%v] add dp[%v] status[%v] isRecover[%v]",
id, value.PartitionID, value.GetDecommissionStatus(), value.isRecover)
}
func (l *DecommissionDataPartitionList) Remove(value *DataPartition) {
if value == nil {
log.LogWarnf("Cannot remove nil value")
return
}
l.mu.Lock()
defer l.mu.Unlock()
if elm, ok := l.cacheMap[value.PartitionID]; ok {
delete(l.cacheMap, value.PartitionID)
l.decommissionList.Remove(elm)
log.LogDebugf("Remove dp[%v]", value.PartitionID)
}
}
func (l *DecommissionDataPartitionList) getDecommissionParallelStatus() (int32, int32, []uint64) {
l.mu.Lock()
defer l.mu.Unlock()
dps := make([]uint64, 0)
for id := range l.runningMap {
dps = append(dps, id)
}
return atomic.LoadInt32(&l.curParallel), atomic.LoadInt32(&l.parallelLimit), dps
}
func (l *DecommissionDataPartitionList) updateMaxParallel(maxParallel int32) {
atomic.StoreInt32(&l.parallelLimit, maxParallel)
}
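// A parallelLimit of 0 is treated as "no limit": acquireDecommissionToken always succeeds in
// that case, while curParallel and runningMap are still maintained so that the parallel status
// query keeps reporting the running data partitions.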
func (l *DecommissionDataPartitionList) acquireDecommissionToken(id uint64) bool {
if atomic.LoadInt32(&l.parallelLimit) == 0 {
l.mu.Lock()
l.runningMap[id] = struct{}{}
atomic.StoreInt32(&l.curParallel, int32(len(l.runningMap)))
l.mu.Unlock()
return true
}
if atomic.LoadInt32(&l.curParallel) >= atomic.LoadInt32(&l.parallelLimit) {
return false
}
l.mu.Lock()
l.runningMap[id] = struct{}{}
atomic.StoreInt32(&l.curParallel, int32(len(l.runningMap)))
l.mu.Unlock()
return true
}
func (l *DecommissionDataPartitionList) releaseDecommissionToken(id uint64) {
l.mu.Lock()
defer l.mu.Unlock()
if _, ok := l.runningMap[id]; !ok {
return
}
delete(l.runningMap, id)
atomic.StoreInt32(&l.curParallel, int32(len(l.runningMap)))
}
func (l *DecommissionDataPartitionList) GetAllDecommissionDataPartitions() (collection []*DataPartition) {
l.mu.Lock()
defer l.mu.Unlock()
collection = make([]*DataPartition, 0, l.decommissionList.Len())
for elm := l.decommissionList.Front(); elm != nil; elm = elm.Next() {
collection = append(collection, elm.Value.(*DataPartition))
}
return collection
}
func (l *DecommissionDataPartitionList) startTraverse() {
l.start <- struct{}{}
}
func (l *DecommissionDataPartitionList) traverse(c *Cluster) {
t := time.NewTicker(DecommissionInterval)
// wait for all dps to be loaded when metadata is reloaded
<-l.start
defer t.Stop()
for {
select {
case <-l.done:
log.LogWarnf("traverse stopped!")
return
case <-t.C:
if c.partition != nil && !c.partition.IsRaftLeader() {
log.LogWarnf("Leader changed, stop traverse!")
continue
}
allDecommissionDP := l.GetAllDecommissionDataPartitions()
for _, dp := range allDecommissionDP {
if dp.IsDecommissionSuccess() {
log.LogDebugf("action[DecommissionListTraverse]Remove dp[%v] for success",
dp.PartitionID)
l.Remove(dp)
dp.ReleaseDecommissionToken(c)
dp.ResetDecommissionStatus()
c.syncUpdateDataPartition(dp)
} else if dp.IsDecommissionFailed() {
if !dp.tryRollback(c) {
dp.restoreReplica(c)
log.LogDebugf("action[DecommissionListTraverse]Remove dp[%v] for fail",
dp.PartitionID)
l.Remove(dp)
}
// both rollback failure and success need to release the token
dp.ReleaseDecommissionToken(c)
} else if dp.IsDecommissionPaused() {
log.LogDebugf("action[DecommissionListTraverse]Remove dp[%v] for paused ",
dp.PartitionID)
dp.ReleaseDecommissionToken(c)
l.Remove(dp)
} else if dp.IsDecommissionInitial() { // fix is done, no token to release
l.Remove(dp)
dp.ResetDecommissionStatus()
c.syncUpdateDataPartition(dp)
} else if dp.IsMarkDecommission() && dp.TryAcquireDecommissionToken(c) {
// TODO: decommission here
go func(dp *DataPartition) {
if !dp.TryToDecommission(c) {
// retry should release token
if dp.IsMarkDecommission() {
dp.ReleaseDecommissionToken(c)
}
}
}(dp) // a special replica count takes some time to go from prepare to running
}
}
}
}
}
type DecommissionDiskList struct {
mu sync.Mutex
cacheMap map[string]*list.Element
decommissionList *list.List
}
func NewDecommissionDiskList() *DecommissionDiskList {
l := new(DecommissionDiskList)
l.mu = sync.Mutex{}
l.cacheMap = make(map[string]*list.Element)
l.decommissionList = list.New()
return l
}
func (l *DecommissionDiskList) Put(nsId uint64, value *DecommissionDisk) {
if value == nil {
log.LogWarnf("action[DecommissionDataPartitionListPut] ns[%v] cannot put nil value", nsId)
return
}
// only disks in running or mark status can be added
if !value.canAddToDecommissionList() {
log.LogWarnf("action[DecommissionDataPartitionListPut] ns[%v] put wrong disk[%v] status[%v]",
nsId, value.GenerateKey(), value.GetDecommissionStatus())
return
}
l.mu.Lock()
defer l.mu.Unlock()
if _, ok := l.cacheMap[value.GenerateKey()]; ok {
return
}
elm := l.decommissionList.PushBack(value)
l.cacheMap[value.GenerateKey()] = elm
log.LogDebugf("action[DecommissionDataPartitionListPut] ns[%v] add disk[%v] status[%v] type[%v]",
nsId, value.GenerateKey(), value.GetDecommissionStatus(), value.Type)
}
func (l *DecommissionDiskList) Remove(nsId uint64, value *DecommissionDisk) {
if value == nil {
log.LogWarnf("action[DecommissionDataPartitionListRemove] ns[%v]Cannot remove nil value", nsId)
return
}
l.mu.Lock()
defer l.mu.Unlock()
if elm, ok := l.cacheMap[value.GenerateKey()]; ok {
delete(l.cacheMap, value.GenerateKey())
l.decommissionList.Remove(elm)
log.LogDebugf("action[DecommissionDataPartitionListRemove] ns[%v] remove disk[%v]", nsId, value.GenerateKey())
}
}
func (l *DecommissionDiskList) Length() int {
l.mu.Lock()
defer l.mu.Unlock()
return l.decommissionList.Len()
}
// pop only decommission disks in markDecommission status, starting from the front; a limit of 0 means no limit
func (l *DecommissionDiskList) PopMarkDecommissionDisk(limit int) (count int, collection []*DecommissionDisk) {
l.mu.Lock()
defer l.mu.Unlock()
collection = make([]*DecommissionDisk, 0)
for elm := l.decommissionList.Front(); elm != nil; elm = elm.Next() {
if count == limit && limit != 0 {
break
}
disk := elm.Value.(*DecommissionDisk)
if disk.GetDecommissionStatus() != markDecommission {
continue
}
collection = append(collection, disk)
count++
log.LogDebugf("action[PopMarkDecommissionDisk] pop disk[%v]", disk)
}
return count, collection
}
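// Illustrative usage sketch (hypothetical helper): drain up to n marked disks from a list and
// hand them to the cluster, mirroring what traverseDecommissionDisk does on every tick for the
// manual and auto lists.
func popAndDecommission(c *Cluster, l *DecommissionDiskList, n int) {
cnt, disks := l.PopMarkDecommissionDisk(n)
log.LogDebugf("action[popAndDecommission] popped %v marked disks", cnt)
for _, disk := range disks {
c.TryDecommissionDisk(disk)
}
}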
package master
import (
"crypto/sha1"
"encoding/hex"
"io"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
const (
accessKeyLength = 16
secretKeyLength = 32
RootUserID = "root"
DefaultRootPasswd = "CubeFSRoot"
DefaultUserPassword = "CubeFSUser"
)
type User struct {
fsm *MetadataFsm
partition raftstore.Partition
userStore sync.Map // K: userID, V: UserInfo
AKStore sync.Map // K: ak, V: userID
volUser sync.Map // K: vol, V: userIDs
userStoreMutex sync.RWMutex
AKStoreMutex sync.RWMutex
volUserMutex sync.RWMutex
}
func newUser(fsm *MetadataFsm, partition raftstore.Partition) (u *User) {
u = new(User)
u.fsm = fsm
u.partition = partition
return
}
func (u *User) createKey(param *proto.UserCreateParam) (userInfo *proto.UserInfo, err error) {
var (
AKUser *proto.AKUser
userPolicy *proto.UserPolicy
exist bool
)
if param.ID == "" {
err = proto.ErrInvalidUserID
return
}
if !param.Type.Valid() {
err = proto.ErrInvalidUserType
return
}
userID := param.ID
password := param.Password
if password == "" {
password = DefaultUserPassword
}
accessKey := param.AccessKey
if accessKey == "" {
accessKey = util.RandomString(accessKeyLength, util.Numeric|util.LowerLetter|util.UpperLetter)
} else {
if !proto.IsValidAK(accessKey) {
err = proto.ErrInvalidAccessKey
return
}
}
secretKey := param.SecretKey
if secretKey == "" {
secretKey = util.RandomString(secretKeyLength, util.Numeric|util.LowerLetter|util.UpperLetter)
} else {
if !proto.IsValidSK(secretKey) {
err = proto.ErrInvalidSecretKey
return
}
}
userType := param.Type
description := param.Description
u.userStoreMutex.Lock()
defer u.userStoreMutex.Unlock()
u.AKStoreMutex.Lock()
defer u.AKStoreMutex.Unlock()
// check duplicate
if _, exist = u.userStore.Load(userID); exist {
err = proto.ErrDuplicateUserID
return
}
_, exist = u.AKStore.Load(accessKey)
for exist {
accessKey = util.RandomString(accessKeyLength, util.Numeric|util.LowerLetter|util.UpperLetter)
_, exist = u.AKStore.Load(accessKey)
}
userPolicy = proto.NewUserPolicy()
userInfo = &proto.UserInfo{
UserID: userID, AccessKey: accessKey, SecretKey: secretKey, Policy: userPolicy,
UserType: userType, CreateTime: time.Unix(time.Now().Unix(), 0).Format(proto.TimeFormat), Description: description,
}
AKUser = &proto.AKUser{AccessKey: accessKey, UserID: userID, Password: encodingPassword(password)}
if err = u.syncAddUserInfo(userInfo); err != nil {
return
}
if err = u.syncAddAKUser(AKUser); err != nil {
return
}
u.userStore.Store(userID, userInfo)
u.AKStore.Store(accessKey, AKUser)
return
}
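// Illustrative caller sketch (hypothetical): creating a user only requires an ID and a valid
// type; an empty password falls back to DefaultUserPassword and missing keys are generated.
// proto.UserTypeNormal is assumed here to be the ordinary non-root user type.
func createExampleUser(u *User) (*proto.UserInfo, error) {
return u.createKey(&proto.UserCreateParam{
ID: "demo-user", // hypothetical user ID
Type: proto.UserTypeNormal,
})
}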
func (u *User) deleteKey(userID string) (err error) {
var (
akUser *proto.AKUser
userInfo *proto.UserInfo
)
u.userStoreMutex.Lock()
defer u.userStoreMutex.Unlock()
u.AKStoreMutex.Lock()
defer u.AKStoreMutex.Unlock()
if value, exist := u.userStore.Load(userID); !exist {
err = proto.ErrUserNotExists
return
} else {
userInfo = value.(*proto.UserInfo)
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
if len(userInfo.Policy.OwnVols) > 0 {
err = proto.ErrOwnVolExists
return
}
if userInfo.UserType == proto.UserTypeRoot {
err = proto.ErrNoPermission
return
}
if akUser, err = u.getAKUser(userInfo.AccessKey); err != nil {
return
}
if err = u.syncDeleteUserInfo(userInfo); err != nil {
return
}
if err = u.syncDeleteAKUser(akUser); err != nil {
return
}
u.userStore.Delete(userID)
u.AKStore.Delete(akUser.AccessKey)
// delete userID from related policy in volUserStore
u.removeUserFromAllVol(userID)
log.LogInfof("action[deleteUser], userID: %v, accesskey[%v]", userID, userInfo.AccessKey)
return
}
func (u *User) updateKey(param *proto.UserUpdateParam) (userInfo *proto.UserInfo, err error) {
if param.UserID == "" {
err = proto.ErrInvalidUserID
return
}
u.userStoreMutex.Lock()
defer u.userStoreMutex.Unlock()
u.AKStoreMutex.Lock()
defer u.AKStoreMutex.Unlock()
if value, exist := u.userStore.Load(param.UserID); !exist {
err = proto.ErrUserNotExists
return
} else {
userInfo = value.(*proto.UserInfo)
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
if userInfo.UserType == proto.UserTypeRoot {
err = proto.ErrNoPermission
return
}
formerAK := userInfo.AccessKey
var akMark, skMark, typeMark, describeMark int
if param.AccessKey != "" {
if !proto.IsValidAK(param.AccessKey) {
err = proto.ErrInvalidAccessKey
return
}
if _, exist := u.AKStore.Load(param.AccessKey); exist {
err = proto.ErrDuplicateAccessKey
return
}
akMark = 1
}
if param.SecretKey != "" {
if !proto.IsValidSK(param.SecretKey) {
err = proto.ErrInvalidSecretKey
return
}
skMark = 1
}
// Type == 0 means the type is not modified
if param.Type != 0 {
if param.Type.Valid() {
typeMark = 1
} else {
err = proto.ErrInvalidUserType
return
}
}
if param.Description != "" {
describeMark = 1
}
var akUserBef *proto.AKUser
var akUserAft *proto.AKUser
if value, exist := u.AKStore.Load(formerAK); exist {
akUserBef = value.(*proto.AKUser)
} else {
err = proto.ErrAccessKeyNotExists
return
}
if akMark == 1 {
userInfo.AccessKey = param.AccessKey
}
if skMark == 1 {
userInfo.SecretKey = param.SecretKey
}
if typeMark == 1 {
userInfo.UserType = param.Type
}
if describeMark == 1 {
userInfo.Description = param.Description
}
if len(strings.TrimSpace(param.Password)) != 0 {
akUserBef.Password = encodingPassword(param.Password)
}
akUserAft = &proto.AKUser{AccessKey: userInfo.AccessKey, UserID: param.UserID, Password: akUserBef.Password}
if err = u.syncUpdateUserInfo(userInfo); err != nil {
return
}
if err = u.syncDeleteAKUser(akUserBef); err != nil {
return
}
if err = u.syncAddAKUser(akUserAft); err != nil {
return
}
u.AKStore.Delete(formerAK)
u.AKStore.Store(akUserAft.AccessKey, akUserAft)
log.LogInfof("action[updateUser], userID: %v, accesskey[%v], secretkey[%v]", userInfo.UserID, userInfo.AccessKey, userInfo.SecretKey)
return
}
func (u *User) getKeyInfo(ak string) (userInfo *proto.UserInfo, err error) {
var akUser *proto.AKUser
if akUser, err = u.getAKUser(ak); err != nil {
return
}
if userInfo, err = u.getUserInfo(akUser.UserID); err != nil {
return
}
log.LogInfof("action[getKeyInfo], accesskey[%v]", ak)
return
}
func (u *User) getUserInfo(userID string) (userInfo *proto.UserInfo, err error) {
if value, exist := u.userStore.Load(userID); exist {
userInfo = value.(*proto.UserInfo)
} else {
err = proto.ErrUserNotExists
return
}
log.LogInfof("action[getUserInfo], userID: %v", userID)
return
}
func (u *User) updatePolicy(params *proto.UserPermUpdateParam) (userInfo *proto.UserInfo, err error) {
if userInfo, err = u.getUserInfo(params.UserID); err != nil {
return
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
if userInfo.Policy.IsOwn(params.Volume) {
err = proto.ErrIsOwner
return
}
userInfo.Policy.AddAuthorizedVol(params.Volume, params.Policy)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
return
}
if err = u.addUserToVol(params.UserID, params.Volume); err != nil {
return
}
log.LogInfof("action[updatePolicy], userID: %v, volume: %v", params.UserID, params.Volume)
return
}
func (u *User) removePolicy(params *proto.UserPermRemoveParam) (userInfo *proto.UserInfo, err error) {
if userInfo, err = u.getUserInfo(params.UserID); err != nil {
return
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
if userInfo.Policy.IsOwn(params.Volume) {
err = proto.ErrIsOwner
return
}
userInfo.Policy.RemoveAuthorizedVol(params.Volume)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
return
}
if err = u.removeUserFromVol(params.UserID, params.Volume); err != nil {
return
}
log.LogInfof("action[removePolicy], userID: %v, volume: %v", params.UserID, params.Volume)
return
}
func (u *User) addOwnVol(userID, volName string) (userInfo *proto.UserInfo, err error) {
if userInfo, err = u.getUserInfo(userID); err != nil {
return
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
userInfo.Policy.AddOwnVol(volName)
userInfo.Policy.RemoveAuthorizedVol(volName)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
return
}
if err = u.addUserToVol(userID, volName); err != nil {
return
}
log.LogInfof("action[addOwnVol], userID: %v, volume: %v", userID, volName)
return
}
func (u *User) removeOwnVol(userID, volName string) (userInfo *proto.UserInfo, err error) {
if userInfo, err = u.getUserInfo(userID); err != nil {
return
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
userInfo.Policy.RemoveOwnVol(volName)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
return
}
if err = u.removeUserFromVol(userID, volName); err != nil {
return
}
log.LogInfof("action[removeOwnVol], userID: %v, volume: %v", userID, volName)
return
}
func (u *User) deleteVolPolicy(volName string) (err error) {
var (
volUser *proto.VolUser
userInfo *proto.UserInfo
)
// delete policy
deletedUsers := make([]string, 0)
var userIDs []string
if userIDs, err = u.getUsersOfVol(volName); err != nil {
return
}
for _, userID := range userIDs {
if userInfo, err = u.getUserInfo(userID); err != nil {
if err == proto.ErrUserNotExists {
deletedUsers = append(deletedUsers, userID)
log.LogWarnf("action[deleteVolPolicy], userID: %v does not exist", userID)
continue
}
return
}
userInfo.Mu.Lock()
userInfo.Policy.RemoveOwnVol(volName)
userInfo.Policy.RemoveAuthorizedVol(volName)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
userInfo.Mu.Unlock()
return
}
userInfo.Mu.Unlock()
}
// delete volName index
if value, exist := u.volUser.Load(volName); exist {
volUser = value.(*proto.VolUser)
} else {
return nil
}
volUser.Mu.Lock()
if err = u.syncDeleteVolUser(volUser); err != nil {
volUser.Mu.Unlock()
return
}
u.volUser.Delete(volUser.Vol)
volUser.Mu.Unlock()
for _, deletedUser := range deletedUsers {
u.removeUserFromAllVol(deletedUser)
}
log.LogInfof("action[deleteVolPolicy], volName: %v", volName)
return
}
func (u *User) transferVol(params *proto.UserTransferVolParam) (targetUserInfo *proto.UserInfo, err error) {
var userInfo *proto.UserInfo
userInfo, err = u.getUserInfo(params.UserSrc)
if (err != nil && err != proto.ErrUserNotExists) || (!params.Force && err == proto.ErrUserNotExists) {
return
}
if err == nil {
isOwned := userInfo.Policy.IsOwn(params.Volume)
if !isOwned && !params.Force && params.UserSrc != params.UserDst {
err = proto.ErrHaveNoPolicy
return
}
if isOwned {
if _, err = u.removeOwnVol(params.UserSrc, params.Volume); err != nil {
return
}
}
}
if targetUserInfo, err = u.addOwnVol(params.UserDst, params.Volume); err != nil {
return
}
log.LogInfof("action[transferVol], volName: %v, userSrc: %v, userDst: %v", params.Volume, params.UserSrc, params.UserDst)
return
}
func (u *User) getAllUserInfo(keywords string) (users []*proto.UserInfo) {
users = make([]*proto.UserInfo, 0)
u.userStore.Range(func(key, value interface{}) bool {
userInfo := value.(*proto.UserInfo)
if strings.Contains(userInfo.UserID, keywords) {
users = append(users, userInfo)
}
return true
})
log.LogInfof("action[getAllUserInfo], keywords: %v, total numbers: %v", keywords, len(users))
return
}
func (u *User) getUsersOfVol(volName string) (userIDs []string, err error) {
var volUser *proto.VolUser
userIDs = make([]string, 0)
if value, exist := u.volUser.Load(volName); exist {
volUser = value.(*proto.VolUser)
} else {
err = proto.ErrHaveNoPolicy
return
}
volUser.Mu.RLock()
defer volUser.Mu.RUnlock()
for _, userID := range volUser.UserIDs {
userIDs = append(userIDs, userID)
}
log.LogInfof("action[getUsersOfVol], vol: %v, user numbers: %v", volName, len(userIDs))
return
}
func (u *User) getAKUser(ak string) (akUser *proto.AKUser, err error) {
if value, exist := u.AKStore.Load(ak); exist {
akUser = value.(*proto.AKUser)
} else {
err = proto.ErrAccessKeyNotExists
}
return
}
func (u *User) addUserToVol(userID, volName string) (err error) {
u.volUserMutex.Lock()
defer u.volUserMutex.Unlock()
var volUser *proto.VolUser
if value, ok := u.volUser.Load(volName); ok {
volUser = value.(*proto.VolUser)
volUser.Mu.Lock()
defer volUser.Mu.Unlock()
if contains(volUser.UserIDs, userID) {
return
}
volUser.UserIDs = append(volUser.UserIDs, userID)
} else {
volUser = &proto.VolUser{Vol: volName, UserIDs: []string{userID}}
u.volUser.Store(volName, volUser)
}
if err = u.syncAddVolUser(volUser); err != nil {
err = proto.ErrPersistenceByRaft
return
}
return
}
func (u *User) removeUserFromVol(userID, volName string) (err error) {
var volUser *proto.VolUser
if value, ok := u.volUser.Load(volName); ok {
volUser = value.(*proto.VolUser)
volUser.Mu.Lock()
defer volUser.Mu.Unlock()
volUser.UserIDs, _ = removeString(volUser.UserIDs, userID)
} else {
err = proto.ErrHaveNoPolicy
return
}
if err = u.syncUpdateVolUser(volUser); err != nil {
err = proto.ErrPersistenceByRaft
return
}
return
}
func (u *User) removeUserFromAllVol(userID string) {
u.volUser.Range(func(key, value interface{}) bool {
volUser := value.(*proto.VolUser)
volUser.Mu.Lock()
var exist bool
volUser.UserIDs, exist = removeString(volUser.UserIDs, userID)
if exist {
if err := u.syncUpdateVolUser(volUser); err != nil {
err = proto.ErrPersistenceByRaft
log.LogErrorf("action[deleteUser], userID: %v, volUser: %v, err: %v", userID, volUser, err)
}
}
volUser.Mu.Unlock()
return true
})
}
func removeString(array []string, element string) ([]string, bool) {
for k, v := range array {
if v == element {
return append(array[:k], array[k+1:]...), true
}
}
return array, false
}
func encodingPassword(s string) string {
t := sha1.New()
io.WriteString(t, s)
return hex.EncodeToString(t.Sum(nil))
}
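// Example (illustrative): encodingPassword returns the lowercase hex SHA-1 digest of the
// plaintext, which is what gets stored in AKUser.Password at creation time, so any later
// password check compares digests rather than plaintext passwords.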
func (u *User) clearUserStore() {
u.userStore.Range(func(key, value interface{}) bool {
u.userStore.Delete(key)
return true
})
}
func (u *User) clearAKStore() {
u.AKStore.Range(func(key, value interface{}) bool {
u.AKStore.Delete(key)
return true
})
}
func (u *User) clearVolUsers() {
u.volUser.Range(func(key, value interface{}) bool {
u.volUser.Delete(key)
return true
})
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func (u *User) submit(metadata *RaftCmd) (err error) {
cmd, err := metadata.Marshal()
if err != nil {
return errors.New(err.Error())
}
if _, err = u.partition.Submit(cmd); err != nil {
msg := fmt.Sprintf("action[user_submit] err:%v", err.Error())
return errors.New(msg)
}
return
}
// key = #user#userID, value = userInfo
func (u *User) syncAddUserInfo(userInfo *proto.UserInfo) (err error) {
return u.syncPutUserInfo(opSyncAddUserInfo, userInfo)
}
func (u *User) syncDeleteUserInfo(userInfo *proto.UserInfo) (err error) {
return u.syncPutUserInfo(opSyncDeleteUserInfo, userInfo)
}
func (u *User) syncUpdateUserInfo(userInfo *proto.UserInfo) (err error) {
return u.syncPutUserInfo(opSyncUpdateUserInfo, userInfo)
}
func (u *User) syncPutUserInfo(opType uint32, userInfo *proto.UserInfo) (err error) {
raftCmd := new(RaftCmd)
raftCmd.Op = opType
raftCmd.K = userPrefix + userInfo.UserID
raftCmd.V, err = json.Marshal(userInfo)
if err != nil {
return errors.New(err.Error())
}
return u.submit(raftCmd)
}
// key = #ak#accessKey, value = akUser
func (u *User) syncAddAKUser(akUser *proto.AKUser) (err error) {
return u.syncPutAKUser(opSyncAddAKUser, akUser)
}
func (u *User) syncDeleteAKUser(akUser *proto.AKUser) (err error) {
return u.syncPutAKUser(opSyncDeleteAKUser, akUser)
}
func (u *User) syncPutAKUser(opType uint32, akUser *proto.AKUser) (err error) {
userInfo := new(RaftCmd)
userInfo.Op = opType
userInfo.K = akPrefix + akUser.AccessKey
userInfo.V, err = json.Marshal(akUser)
if err != nil {
return errors.New(err.Error())
}
return u.submit(userInfo)
}
// key = #voluser#volName, value = volUser (the volume name and its userIDs)
func (u *User) syncAddVolUser(volUser *proto.VolUser) (err error) {
return u.syncPutVolUser(opSyncAddVolUser, volUser)
}
func (u *User) syncDeleteVolUser(volUser *proto.VolUser) (err error) {
return u.syncPutVolUser(opSyncDeleteVolUser, volUser)
}
func (u *User) syncUpdateVolUser(volUser *proto.VolUser) (err error) {
return u.syncPutVolUser(opSyncUpdateVolUser, volUser)
}
func (u *User) syncPutVolUser(opType uint32, volUser *proto.VolUser) (err error) {
userInfo := new(RaftCmd)
userInfo.Op = opType
userInfo.K = volUserPrefix + volUser.Vol
userInfo.V, err = json.Marshal(volUser)
if err != nil {
return errors.New(err.Error())
}
return u.submit(userInfo)
}
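// exampleUserRaftKeys is an illustrative sketch (not part of the original code)
// of the raft KV layout produced by the sync* helpers above. The user, access
// key and volume names are hypothetical; the key formats follow the comments
// above, with the concrete prefix constants defined elsewhere in this package.
func exampleUserRaftKeys(u *User) error {
userInfo := &proto.UserInfo{UserID: "alice"} // persisted under "#user#alice"
if err := u.syncAddUserInfo(userInfo); err != nil {
return err
}
akUser := &proto.AKUser{AccessKey: "AKIAEXAMPLE", UserID: "alice"} // "#ak#AKIAEXAMPLE"
if err := u.syncAddAKUser(akUser); err != nil {
return err
}
volUser := &proto.VolUser{Vol: "vol-demo", UserIDs: []string{"alice"}} // "#voluser#vol-demo"
return u.syncAddVolUser(volUser) // every write is replicated through submit() before taking effect
}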
func (u *User) loadUserStore() (err error) {
result, err := u.fsm.store.SeekForPrefix([]byte(userPrefix))
if err != nil {
err = fmt.Errorf("action[loadUserKeyInfo], err: %v", err.Error())
return err
}
for _, value := range result {
userInfo := &proto.UserInfo{}
if err = json.Unmarshal(value, userInfo); err != nil {
err = fmt.Errorf("action[loadUserKeyInfo], unmarshal err: %v", err.Error())
return err
}
u.userStore.Store(userInfo.UserID, userInfo)
log.LogInfof("action[loadUserKeyInfo], userID[%v]", userInfo.UserID)
}
return
}
func (u *User) loadAKStore() (err error) {
result, err := u.fsm.store.SeekForPrefix([]byte(akPrefix))
if err != nil {
err = fmt.Errorf("action[loadAKStore], err: %v", err.Error())
return err
}
for _, value := range result {
akUser := &proto.AKUser{}
if err = json.Unmarshal(value, akUser); err != nil {
err = fmt.Errorf("action[loadAKStore], unmarshal err: %v", err.Error())
return err
}
u.AKStore.Store(akUser.AccessKey, akUser)
log.LogInfof("action[loadAKStore], ak[%v], userID[%v]", akUser.AccessKey, akUser.UserID)
}
return
}
func (u *User) loadVolUsers() (err error) {
result, err := u.fsm.store.SeekForPrefix([]byte(volUserPrefix))
if err != nil {
err = fmt.Errorf("action[loadVolUsers], err: %v", err.Error())
return err
}
for _, value := range result {
volUser := &proto.VolUser{}
if err = json.Unmarshal(value, volUser); err != nil {
err = fmt.Errorf("action[loadVolUsers], unmarshal err: %v", err.Error())
return err
}
u.volUser.Store(volUser.Vol, volUser)
log.LogInfof("action[loadVolUsers], vol[%v]", volUser.Vol)
}
return
}
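// exampleLoadUserMetadata is an illustrative sketch (not part of the original
// code); the real call sites live elsewhere, but a full reload of the user
// module is assumed to rebuild all three in-memory maps from the store in this
// order before user requests are served.
func exampleLoadUserMetadata(u *User) error {
if err := u.loadUserStore(); err != nil {
return err
}
if err := u.loadAKStore(); err != nil {
return err
}
return u.loadVolUsers()
}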
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"math"
"runtime/debug"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
type VolVarargs struct {
zoneName string
description string
capacity uint64 // GB
deleteLockTime int64 // hours
followerRead bool
authenticate bool
dpSelectorName string
dpSelectorParm string
coldArgs *coldVolArgs
domainId uint64
dpReplicaNum uint8
enablePosixAcl bool
dpReadOnlyWhenVolFull bool
enableQuota bool
enableTransaction proto.TxOpMask
txTimeout int64
txConflictRetryNum int64
txConflictRetryInterval int64
txOpLimit int
}
// Vol represents a volume, i.e. a set of meta partitions and data partitions
type Vol struct {
ID uint64
Name string
Owner string
OSSAccessKey string
OSSSecretKey string
dpReplicaNum uint8
mpReplicaNum uint8
Status uint8
threshold float32
dataPartitionSize uint64 // byte
Capacity uint64 // GB
VolType int
EbsBlkSize int
CacheCapacity uint64
CacheAction int
CacheThreshold int
CacheTTL int
CacheHighWater int
CacheLowWater int
CacheLRUInterval int
CacheRule string
PreloadCacheOn bool
NeedToLowerReplica bool
FollowerRead bool
authenticate bool
crossZone bool
domainOn bool
defaultPriority bool // old default zone first
enablePosixAcl bool
enableTransaction proto.TxOpMask
txTimeout int64
txConflictRetryNum int64
txConflictRetryInterval int64
txOpLimit int
zoneName string
MetaPartitions map[uint64]*MetaPartition `graphql:"-"`
dataPartitions *DataPartitionMap
mpsCache []byte
viewCache []byte
createDpMutex sync.RWMutex
createMpMutex sync.RWMutex
createTime int64
DeleteLockTime int64
description string
dpSelectorName string
dpSelectorParm string
domainId uint64
qosManager *QosCtrlManager
DpReadOnlyWhenVolFull bool
aclMgr AclManager
uidSpaceManager *UidSpaceManager
volLock sync.RWMutex
quotaManager *MasterQuotaManager
enableQuota bool
VersionMgr *VolVersionManager
Forbidden bool
mpsLock *mpsLockManager
EnableAuditLog bool
preloadCapacity uint64
}
func newVol(vv volValue) (vol *Vol) {
vol = &Vol{ID: vv.ID, Name: vv.Name, MetaPartitions: make(map[uint64]*MetaPartition, 0)}
if vol.threshold <= 0 {
vol.threshold = defaultMetaPartitionMemUsageThreshold
}
vol.dataPartitions = newDataPartitionMap(vv.Name)
vol.VersionMgr = newVersionMgr(vol)
vol.dpReplicaNum = vv.DpReplicaNum
vol.mpReplicaNum = vv.ReplicaNum
vol.Owner = vv.Owner
vol.dataPartitionSize = vv.DataPartitionSize
vol.Capacity = vv.Capacity
vol.FollowerRead = vv.FollowerRead
vol.authenticate = vv.Authenticate
vol.crossZone = vv.CrossZone
vol.zoneName = vv.ZoneName
vol.viewCache = make([]byte, 0)
vol.mpsCache = make([]byte, 0)
vol.createTime = vv.CreateTime
vol.DeleteLockTime = vv.DeleteLockTime
vol.description = vv.Description
vol.defaultPriority = vv.DefaultPriority
vol.domainId = vv.DomainId
vol.enablePosixAcl = vv.EnablePosixAcl
vol.enableQuota = vv.EnableQuota
vol.enableTransaction = vv.EnableTransaction
vol.txTimeout = vv.TxTimeout
vol.txConflictRetryNum = vv.TxConflictRetryNum
vol.txConflictRetryInterval = vv.TxConflictRetryInterval
vol.txOpLimit = vv.TxOpLimit
vol.VolType = vv.VolType
vol.EbsBlkSize = vv.EbsBlkSize
vol.CacheCapacity = vv.CacheCapacity
vol.CacheAction = vv.CacheAction
vol.CacheThreshold = vv.CacheThreshold
vol.CacheTTL = vv.CacheTTL
vol.CacheHighWater = vv.CacheHighWater
vol.CacheLowWater = vv.CacheLowWater
vol.CacheLRUInterval = vv.CacheLRUInterval
vol.CacheRule = vv.CacheRule
vol.Status = vv.Status
limitQosVal := &qosArgs{
qosEnable: vv.VolQosEnable,
diskQosEnable: vv.DiskQosEnable,
iopsRVal: vv.IopsRLimit,
iopsWVal: vv.IopsWLimit,
flowRVal: vv.FlowRlimit,
flowWVal: vv.FlowWlimit,
}
vol.initQosManager(limitQosVal)
magnifyQosVal := &qosArgs{
iopsRVal: uint64(vv.IopsRMagnify),
iopsWVal: uint64(vv.IopsWMagnify),
flowRVal: uint64(vv.FlowWMagnify),
flowWVal: uint64(vv.FlowWMagnify),
}
vol.qosManager.volUpdateMagnify(magnifyQosVal)
vol.DpReadOnlyWhenVolFull = vv.DpReadOnlyWhenVolFull
vol.mpsLock = newMpsLockManager(vol)
vol.EnableAuditLog = true
vol.preloadCapacity = math.MaxUint64 // sentinel value: preload capacity is calculated lazily on first access
return
}
func newVolFromVolValue(vv *volValue) (vol *Vol) {
vol = newVol(*vv)
// overwrite the OSS credentials
vol.OSSAccessKey, vol.OSSSecretKey = vv.OSSAccessKey, vv.OSSSecretKey
vol.Status = vv.Status
vol.dpSelectorName = vv.DpSelectorName
vol.dpSelectorParm = vv.DpSelectorParm
if vol.txTimeout == 0 {
vol.txTimeout = proto.DefaultTransactionTimeout
}
if vol.txConflictRetryNum == 0 {
vol.txConflictRetryNum = proto.DefaultTxConflictRetryNum
}
if vol.txConflictRetryInterval == 0 {
vol.txConflictRetryInterval = proto.DefaultTxConflictRetryInterval
}
vol.Forbidden = vv.Forbidden
vol.EnableAuditLog = vv.EnableAuditLog
return vol
}
type mpsLockManager struct {
mpsLock sync.RWMutex
lastEffectStack string
lockTime time.Time
innerLock sync.RWMutex
onLock bool
hang bool
vol *Vol
enable int32 // lock tracking is enabled only when debug logging is configured
}
var (
lockCheckInterval = time.Second
lockExpireInterval = time.Minute
)
func newMpsLockManager(vol *Vol) *mpsLockManager {
lc := &mpsLockManager{vol: vol}
go lc.CheckExceptionLock(lockCheckInterval, lockExpireInterval)
if log.EnableDebug() {
atomic.StoreInt32(&lc.enable, 0)
}
return lc
}
func (mpsLock *mpsLockManager) Lock() {
mpsLock.mpsLock.Lock()
if log.EnableDebug() && atomic.LoadInt32(&mpsLock.enable) == 1 {
mpsLock.innerLock.Lock()
mpsLock.onLock = true
mpsLock.lockTime = time.Now()
mpsLock.lastEffectStack = fmt.Sprintf("Lock stack %v", string(debug.Stack()))
}
}
func (mpsLock *mpsLockManager) UnLock() {
mpsLock.mpsLock.Unlock()
if log.EnableDebug() && atomic.LoadInt32(&mpsLock.enable) == 1 {
mpsLock.onLock = false
mpsLock.lockTime = time.Unix(0, 0)
mpsLock.lastEffectStack = fmt.Sprintf("UnLock stack %v", string(debug.Stack()))
mpsLock.innerLock.Unlock()
}
}
func (mpsLock *mpsLockManager) RLock() {
mpsLock.mpsLock.RLock()
if log.EnableDebug() && atomic.LoadInt32(&mpsLock.enable) == 1 {
mpsLock.innerLock.RLock()
mpsLock.hang = false
mpsLock.onLock = true
mpsLock.lockTime = time.Now()
mpsLock.lastEffectStack = fmt.Sprintf("RLock stack %v", string(debug.Stack()))
}
}
func (mpsLock *mpsLockManager) RUnlock() {
mpsLock.mpsLock.RUnlock()
if log.EnableDebug() && atomic.LoadInt32(&mpsLock.enable) == 1 {
mpsLock.onLock = false
mpsLock.hang = false
mpsLock.lockTime = time.Unix(0, 0)
mpsLock.lastEffectStack = fmt.Sprintf("RUnlock stack %v", string(debug.Stack()))
mpsLock.innerLock.RUnlock()
}
}
func (mpsLock *mpsLockManager) CheckExceptionLock(interval time.Duration, expireTime time.Duration) {
ticker := time.NewTicker(interval)
for {
select {
case <-ticker.C:
if mpsLock.vol.status() == proto.VolStatusMarkDelete || atomic.LoadInt32(&mpsLock.enable) == 0 {
break
}
if !log.EnableDebug() {
continue
}
if !mpsLock.onLock {
continue
}
tm := time.Now()
if tm.After(mpsLock.lockTime.Add(expireTime)) {
log.LogWarnf("vol %v mpsLock hang more than %v since time %v stack(%v)",
mpsLock.vol.Name, expireTime, mpsLock.lockTime, mpsLock.lastEffectStack)
mpsLock.hang = true
}
}
}
}
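// exampleLockedMetaPartitionUpdate is an illustrative sketch (not part of the
// original code) of the locking pattern the manager above is meant to watch:
// callers take the lock through mpsLockManager so that, when debug logging is
// enabled, CheckExceptionLock can report a holder exceeding lockExpireInterval.
func exampleLockedMetaPartitionUpdate(vol *Vol, mp *MetaPartition) {
vol.mpsLock.Lock()
defer vol.mpsLock.UnLock()
vol.MetaPartitions[mp.PartitionID] = mp
}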
func (vol *Vol) CheckStrategy(c *Cluster) {
// make sure all in-progress version-deleting tasks are resumed before checking
if !atomic.CompareAndSwapInt32(&vol.VersionMgr.checkStrategy, 0, 1) {
return
}
go func() {
waitTime := 5 * time.Second * defaultIntervalToCheck
waited := false
for {
time.Sleep(waitTime)
if vol.Status == proto.VolStatusMarkDelete {
break
}
if c != nil && c.IsLeader() {
if !waited {
log.LogInfof("wait for %v seconds once after becoming leader to make sure all the ver deleting tasks are resumed",
waitTime)
time.Sleep(waitTime)
waited = true
}
if !proto.IsHot(vol.VolType) {
return
}
vol.VersionMgr.RLock()
if vol.VersionMgr.strategy.GetPeriodicSecond() == 0 || !vol.VersionMgr.strategy.Enable { // strategy has not been set
vol.VersionMgr.RUnlock()
continue
}
vol.VersionMgr.RUnlock()
vol.VersionMgr.checkCreateStrategy(c)
vol.VersionMgr.checkDeleteStrategy(c)
}
}
}()
}
func (vol *Vol) CalculatePreloadCapacity() uint64 {
total := uint64(0)
dps := vol.dataPartitions.partitions
for _, dp := range dps {
if proto.IsPreLoadDp(dp.PartitionType) {
total += dp.total / util.GB
}
}
if overSoldFactor <= 0 {
return total
}
return uint64(float32(total) / overSoldFactor)
}
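// Worked example for CalculatePreloadCapacity above (hypothetical numbers, not
// taken from any real configuration): three preload data partitions of 120 GB
// each give total = 360 GB; with overSoldFactor = 1.5 the reported preload
// capacity is uint64(360/1.5) = 240 GB, while an unset factor (<= 0) returns
// the raw 360 GB. getPreloadCapacity below simply caches this result.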
func (vol *Vol) getPreloadCapacity() uint64 {
if vol.preloadCapacity != math.MaxUint64 {
return vol.preloadCapacity
}
vol.preloadCapacity = vol.CalculatePreloadCapacity()
log.LogDebugf("[getPreloadCapacity] vol(%v) calculated preload capacity: %v", vol.Name, vol.preloadCapacity)
return vol.preloadCapacity
}
func (vol *Vol) initQosManager(limitArgs *qosArgs) {
vol.qosManager = &QosCtrlManager{
cliInfoMgrMap: make(map[uint64]*ClientInfoMgr, 0),
serverFactorLimitMap: make(map[uint32]*ServerFactorLimit, 0),
qosEnable: limitArgs.qosEnable,
vol: vol,
ClientHitTriggerCnt: defaultClientTriggerHitCnt,
ClientReqPeriod: defaultClientReqPeriodSeconds,
}
if limitArgs.iopsRVal == 0 {
limitArgs.iopsRVal = defaultIopsRLimit
}
if limitArgs.iopsWVal == 0 {
limitArgs.iopsWVal = defaultIopsWLimit
}
if limitArgs.flowRVal == 0 {
limitArgs.flowRVal = defaultFlowRLimit
}
if limitArgs.flowWVal == 0 {
limitArgs.flowWVal = defaultFlowWLimit
}
arrLimit := [defaultLimitTypeCnt]uint64{limitArgs.iopsRVal, limitArgs.iopsWVal, limitArgs.flowRVal, limitArgs.flowWVal}
arrType := [defaultLimitTypeCnt]uint32{proto.IopsReadType, proto.IopsWriteType, proto.FlowReadType, proto.FlowWriteType}
for i := 0; i < defaultLimitTypeCnt; i++ {
vol.qosManager.serverFactorLimitMap[arrType[i]] = &ServerFactorLimit{
Name: proto.QosTypeString(arrType[i]),
Type: arrType[i],
Total: arrLimit[i],
Buffer: arrLimit[i],
requestCh: make(chan interface{}, 10240),
qosManager: vol.qosManager,
}
go vol.qosManager.serverFactorLimitMap[arrType[i]].dispatch()
}
}
func (vol *Vol) refreshOSSSecure() (key, secret string) {
vol.OSSAccessKey = util.RandomString(16, util.Numeric|util.LowerLetter|util.UpperLetter)
vol.OSSSecretKey = util.RandomString(32, util.Numeric|util.LowerLetter|util.UpperLetter)
return vol.OSSAccessKey, vol.OSSSecretKey
}
func (vol *Vol) addMetaPartition(mp *MetaPartition) {
vol.mpsLock.Lock()
defer vol.mpsLock.UnLock()
// add the partition, replacing any existing entry with the same ID
vol.MetaPartitions[mp.PartitionID] = mp
}
func (vol *Vol) metaPartition(partitionID uint64) (mp *MetaPartition, err error) {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
mp, ok := vol.MetaPartitions[partitionID]
if !ok {
err = proto.ErrMetaPartitionNotExists
}
return
}
func (vol *Vol) maxPartitionID() (maxPartitionID uint64) {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for id := range vol.MetaPartitions {
if id > maxPartitionID {
maxPartitionID = id
}
}
return
}
func (vol *Vol) getRWMetaPartitionNum() (num uint64, isHeartBeatDone bool) {
if time.Now().Unix()-vol.createTime <= defaultMetaPartitionTimeOutSec {
log.LogInfof("The vol[%v] is being created.", vol.Name)
return num, false
}
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, mp := range vol.MetaPartitions {
if !mp.heartBeatDone {
log.LogInfof("The mp[%v] of vol[%v] is not done", mp.PartitionID, vol.Name)
return num, false
}
if mp.Status == proto.ReadWrite {
num++
} else {
log.LogWarnf("The mp[%v] of vol[%v] is not RW", mp.PartitionID, vol.Name)
}
}
return num, true
}
func (vol *Vol) getDataPartitionsView() (body []byte, err error) {
return vol.dataPartitions.updateResponseCache(false, 0, vol.VolType)
}
func (vol *Vol) getDataPartitionViewCompress() (body []byte, err error) {
return vol.dataPartitions.updateCompressCache(false, 0, vol.VolType)
}
func (vol *Vol) getDataPartitionByID(partitionID uint64) (dp *DataPartition, err error) {
return vol.dataPartitions.get(partitionID)
}
func (vol *Vol) addMetaPartitions(c *Cluster, count int) (err error) {
// add multiple extra meta partitions in a single call
var (
start uint64
end uint64
)
vol.createMpMutex.Lock()
defer vol.createMpMutex.Unlock()
// update End of the maxMetaPartition range
maxPartitionId := vol.maxPartitionID()
rearMetaPartition := vol.MetaPartitions[maxPartitionId]
oldEnd := rearMetaPartition.End
end = rearMetaPartition.MaxInodeID + gConfig.MetaPartitionInodeIdStep
if err = rearMetaPartition.canSplit(end, gConfig.MetaPartitionInodeIdStep, false); err != nil {
return err
}
rearMetaPartition.End = end
if err = c.syncUpdateMetaPartition(rearMetaPartition); err != nil {
rearMetaPartition.End = oldEnd
log.LogErrorf("action[addMetaPartitions] split partition partitionID[%v] err[%v]", rearMetaPartition.PartitionID, err)
return
}
// create new meta partitions
for i := 0; i < count; i++ {
start = end + 1
end = start + gConfig.MetaPartitionInodeIdStep
if end > (defaultMaxMetaPartitionInodeID - gConfig.MetaPartitionInodeIdStep) {
end = defaultMaxMetaPartitionInodeID
log.LogWarnf("action[addMetaPartitions] vol[%v] add too many meta partition ,partition range overflow ! ", vol.Name)
}
if i == count-1 {
end = defaultMaxMetaPartitionInodeID
}
if err = vol.createMetaPartition(c, start, end); err != nil {
log.LogErrorf("action[addMetaPartitions] vol[%v] add meta partition err[%v]", vol.Name, err)
break
}
if end == defaultMaxMetaPartitionInodeID {
break
}
}
return
}
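// Worked example for addMetaPartitions above, using a hypothetical inode-id
// step S: the last partition's End is first advanced to its MaxInodeID+S and
// persisted via raft (rolled back to oldEnd on failure); each new partition
// then starts at the previous end+1 and spans roughly S inode ids, and the
// last one created is always capped at defaultMaxMetaPartitionInodeID.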
func (vol *Vol) initMetaPartitions(c *Cluster, count int) (err error) {
// initialize `count` meta partitions when the volume is created
var (
start uint64
end uint64
)
if count < defaultInitMetaPartitionCount {
count = defaultInitMetaPartitionCount
}
if count > defaultMaxInitMetaPartitionCount {
count = defaultMaxInitMetaPartitionCount
}
vol.createMpMutex.Lock()
for index := 0; index < count; index++ {
if index != 0 {
start = end + 1
}
end = gConfig.MetaPartitionInodeIdStep * uint64(index+1)
if index == count-1 {
end = defaultMaxMetaPartitionInodeID
}
if err = vol.createMetaPartition(c, start, end); err != nil {
log.LogErrorf("action[initMetaPartitions] vol[%v] init meta partition err[%v]", vol.Name, err)
break
}
}
vol.createMpMutex.Unlock()
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
if len(vol.MetaPartitions) != count {
err = fmt.Errorf("action[initMetaPartitions] vol[%v] init meta partition failed,mpCount[%v],expectCount[%v],err[%v]",
vol.Name, len(vol.MetaPartitions), count, err)
}
return
}
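// Worked example for initMetaPartitions above, with a hypothetical step S and
// count = 3: the initial inode ranges are [0, S], [S+1, 2*S] and
// [2*S+1, defaultMaxMetaPartitionInodeID]; the last partition always absorbs
// the remaining inode-id space.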
func (vol *Vol) initDataPartitions(c *Cluster, dpCount int) (err error) {
if dpCount == 0 {
dpCount = defaultInitDataPartitionCnt
}
// initialize `dpCount` data partitions when the volume is created
err = c.batchCreateDataPartition(vol, dpCount, true)
return
}
func (vol *Vol) checkDataPartitions(c *Cluster) (cnt int) {
if vol.getDataPartitionsCount() == 0 && vol.Status != proto.VolStatusMarkDelete && proto.IsHot(vol.VolType) {
c.batchCreateDataPartition(vol, 1, false)
}
shouldDpInhibitWriteByVolFull := vol.shouldInhibitWriteBySpaceFull()
totalPreloadCapacity := uint64(0)
partitions := vol.dataPartitions.clonePartitions()
for _, dp := range partitions {
if proto.IsPreLoadDp(dp.PartitionType) {
now := time.Now().Unix()
if now > dp.PartitionTTL {
log.LogWarnf("[checkDataPartitions] dp(%d) is deleted because of ttl expired, now(%d), ttl(%d)", dp.PartitionID, now, dp.PartitionTTL)
vol.deleteDataPartition(c, dp)
continue
}
startTime := dp.dataNodeStartTime()
if now-dp.createTime > 600 && dp.used == 0 && now-startTime > 600 { // empty for 10 minutes after creation and data node start
log.LogWarnf("[checkDataPartitions] dp(%d) is deleted because of clear, now(%d), create(%d), start(%d)",
dp.PartitionID, now, dp.createTime, startTime)
vol.deleteDataPartition(c, dp)
continue
}
totalPreloadCapacity += dp.total / util.GB
}
dp.checkReplicaStatus(c.cfg.DataPartitionTimeOutSec)
dp.checkStatus(c.Name, true, c.cfg.DataPartitionTimeOutSec, c, shouldDpInhibitWriteByVolFull, vol.Forbidden)
dp.checkLeader(c.Name, c.cfg.DataPartitionTimeOutSec)
dp.checkMissingReplicas(c.Name, c.leaderInfo.addr, c.cfg.MissingDataPartitionInterval, c.cfg.IntervalToAlarmMissingDataPartition)
dp.checkReplicaNum(c, vol)
if time.Now().Unix()-vol.createTime < defaultIntervalToCheckHeartbeat*3 && !vol.Forbidden {
dp.setReadWrite()
}
if dp.Status == proto.ReadWrite {
cnt++
}
dp.checkDiskError(c.Name, c.leaderInfo.addr)
dp.checkReplicationTask(c.Name, vol.dataPartitionSize)
}
if overSoldFactor > 0 {
totalPreloadCapacity = uint64(float32(totalPreloadCapacity) / overSoldFactor)
}
vol.preloadCapacity = totalPreloadCapacity
if vol.preloadCapacity != 0 {
log.LogDebugf("[checkDataPartitions] vol(%v) totalPreloadCapacity(%v GB), overSoldFactor(%v)",
vol.Name, totalPreloadCapacity, overSoldFactor)
}
return
}
func (vol *Vol) loadDataPartition(c *Cluster) {
partitions, startIndex := vol.dataPartitions.getDataPartitionsToBeChecked(c.cfg.PeriodToLoadALLDataPartitions)
if len(partitions) == 0 {
return
}
c.waitForResponseToLoadDataPartition(partitions)
msg := fmt.Sprintf("action[loadDataPartition] vol[%v],checkStartIndex:%v checkCount:%v",
vol.Name, startIndex, len(partitions))
log.LogInfo(msg)
}
func (vol *Vol) releaseDataPartitions(releaseCount int, afterLoadSeconds int64) {
partitions, startIndex := vol.dataPartitions.getDataPartitionsToBeReleased(releaseCount, afterLoadSeconds)
if len(partitions) == 0 {
return
}
vol.dataPartitions.freeMemOccupiedByDataPartitions(partitions)
msg := fmt.Sprintf("action[freeMemOccupiedByDataPartitions] vol[%v] release data partition start:%v releaseCount:%v",
vol.Name, startIndex, len(partitions))
log.LogInfo(msg)
}
func (vol *Vol) tryUpdateDpReplicaNum(c *Cluster, partition *DataPartition) (err error) {
partition.RLock()
defer partition.RUnlock()
if partition.isRecover || vol.dpReplicaNum != 2 || partition.ReplicaNum != 3 || len(partition.Hosts) != 2 {
return
}
if partition.isSpecialReplicaCnt() {
return
}
oldReplicaNum := partition.ReplicaNum
partition.ReplicaNum = partition.ReplicaNum - 1
if err = c.syncUpdateDataPartition(partition); err != nil {
partition.ReplicaNum = oldReplicaNum
}
return
}
func (vol *Vol) isOkUpdateRepCnt() (ok bool, rsp []uint64) {
if proto.IsCold(vol.VolType) {
return
}
ok = true
dps := vol.cloneDataPartitionMap()
for _, dp := range dps {
if vol.dpReplicaNum != dp.ReplicaNum {
rsp = append(rsp, dp.PartitionID)
ok = false
// report at most 20 mismatched partition IDs
if len(rsp) > 20 {
return
}
}
}
return ok, rsp
}
func (vol *Vol) checkReplicaNum(c *Cluster) {
if !vol.NeedToLowerReplica {
return
}
var err error
if proto.IsCold(vol.VolType) {
return
}
dps := vol.cloneDataPartitionMap()
cnt := 0
for _, dp := range dps {
host := dp.getToBeDecommissionHost(int(vol.dpReplicaNum))
if host == "" {
continue
}
if err = dp.removeOneReplicaByHost(c, host, vol.dpReplicaNum == dp.ReplicaNum); err != nil {
if dp.isSpecialReplicaCnt() && len(dp.Hosts) > 1 {
log.LogWarnf("action[checkReplicaNum] removeOneReplicaByHost host [%v],vol[%v],err[%v]", host, vol.Name, err)
continue
}
log.LogErrorf("action[checkReplicaNum] removeOneReplicaByHost host [%v],vol[%v],err[%v]", host, vol.Name, err)
continue
}
cnt++
if cnt > 100 {
return
}
}
vol.NeedToLowerReplica = false
}
func (vol *Vol) checkMetaPartitions(c *Cluster) {
var tasks []*proto.AdminTask
metaPartitionInodeIdStep := gConfig.MetaPartitionInodeIdStep
maxPartitionID := vol.maxPartitionID()
mps := vol.cloneMetaPartitionMap()
var (
doSplit bool
err error
)
for _, mp := range mps {
doSplit = mp.checkStatus(c.Name, true, int(vol.mpReplicaNum), maxPartitionID, metaPartitionInodeIdStep, vol.Forbidden)
if doSplit && !c.cfg.DisableAutoCreate {
nextStart := mp.MaxInodeID + metaPartitionInodeIdStep
log.LogInfof("cluster[%v],vol[%v],meta partition[%v] splits start[%v] maxinodeid:[%v] default step:[%v],nextStart[%v]",
c.Name, vol.Name, mp.PartitionID, mp.Start, mp.MaxInodeID, metaPartitionInodeIdStep, nextStart)
if err = vol.splitMetaPartition(c, mp, nextStart, metaPartitionInodeIdStep, false); err != nil {
Warn(c.Name, fmt.Sprintf("cluster[%v],vol[%v],meta partition[%v] splits failed,err[%v]", c.Name, vol.Name, mp.PartitionID, err))
}
}
mp.checkLeader(c.Name)
mp.checkReplicaNum(c, vol.Name, vol.mpReplicaNum)
mp.checkEnd(c, maxPartitionID)
mp.reportMissingReplicas(c.Name, c.leaderInfo.addr, defaultMetaPartitionTimeOutSec, defaultIntervalToAlarmMissingMetaPartition)
tasks = append(tasks, mp.replicaCreationTasks(c.Name, vol.Name)...)
}
c.addMetaNodeTasks(tasks)
vol.checkSplitMetaPartition(c, metaPartitionInodeIdStep)
}
func (vol *Vol) checkSplitMetaPartition(c *Cluster, metaPartitionInodeStep uint64) {
maxPartitionID := vol.maxPartitionID()
maxMP, err := vol.metaPartition(maxPartitionID)
if err != nil {
return
}
// Any of the following conditions will trigger max mp split
// 1. The memory of the metanode which max mp belongs to reaches the threshold
// 2. The number of inodes managed by max mp reaches the threshold(0.75)
// 3. The number of RW mp is less than 3
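// Example with a hypothetical step S: a max mp whose MaxInodeID-Start equals
// 0.8*S exceeds the 0.75 usage threshold, so it is split at MaxInodeID + S/4;
// if instead the RW mp count has dropped below lowerLimitRWMetaPartition, the
// larger split point MaxInodeID + S is used.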
maxMPInodeUsedRatio := float64(maxMP.MaxInodeID-maxMP.Start) / float64(metaPartitionInodeStep)
RWMPNum, isHeartBeatDone := vol.getRWMetaPartitionNum()
if !isHeartBeatDone {
log.LogInfof("Not all volume[%s] mp heartbeat is done, skip mp split", vol.Name)
return
}
if maxMP.memUsedReachThreshold(c.Name, vol.Name) || RWMPNum < lowerLimitRWMetaPartition ||
maxMPInodeUsedRatio > metaPartitionInodeUsageThreshold {
end := maxMP.MaxInodeID + metaPartitionInodeStep/4
if RWMPNum < lowerLimitRWMetaPartition {
end = maxMP.MaxInodeID + metaPartitionInodeStep
}
if err := vol.splitMetaPartition(c, maxMP, end, metaPartitionInodeStep, true); err != nil {
msg := fmt.Sprintf("action[checkSplitMetaPartition],split meta maxMP[%v] failed,err[%v]\n",
maxMP.PartitionID, err)
Warn(c.Name, msg)
}
log.LogInfof("volume[%v] split MaxMP[%v], MaxInodeID[%d] Start[%d] RWMPNum[%d] maxMPInodeUsedRatio[%.2f]",
vol.Name, maxPartitionID, maxMP.MaxInodeID, maxMP.Start, RWMPNum, maxMPInodeUsedRatio)
}
return
}
func (mp *MetaPartition) memUsedReachThreshold(clusterName, volName string) bool {
liveReplicas := mp.getLiveReplicas()
foundReadonlyReplica := false
var readonlyReplica *MetaReplica
for _, replica := range liveReplicas {
if replica.Status == proto.ReadOnly {
foundReadonlyReplica = true
readonlyReplica = replica
break
}
}
if !foundReadonlyReplica || readonlyReplica == nil {
return false
}
if readonlyReplica.metaNode.isWritable() {
msg := fmt.Sprintf("action[checkSplitMetaPartition] vol[%v],max meta parition[%v] status is readonly\n",
volName, mp.PartitionID)
Warn(clusterName, msg)
return false
}
return true
}
func (vol *Vol) cloneMetaPartitionMap() (mps map[uint64]*MetaPartition) {
mps = make(map[uint64]*MetaPartition, 0)
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, mp := range vol.MetaPartitions {
mps[mp.PartitionID] = mp
}
return
}
func (vol *Vol) setMpRdOnly() {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, mp := range vol.MetaPartitions {
if mp.Status != proto.Unavailable {
mp.Status = proto.ReadOnly
}
}
}
func (vol *Vol) cloneDataPartitionMap() (dps map[uint64]*DataPartition) {
vol.dataPartitions.RLock()
defer vol.dataPartitions.RUnlock()
dps = make(map[uint64]*DataPartition, 0)
for _, dp := range vol.dataPartitions.partitionMap {
dps[dp.PartitionID] = dp
}
return
}
func (vol *Vol) setDpRdOnly() {
vol.dataPartitions.RLock()
defer vol.dataPartitions.RUnlock()
for _, dp := range vol.dataPartitions.partitionMap {
if dp.Status != proto.Unavailable {
dp.Status = proto.ReadOnly
}
}
}
func (vol *Vol) setStatus(status uint8) {
vol.volLock.Lock()
defer vol.volLock.Unlock()
vol.Status = status
}
func (vol *Vol) status() uint8 {
vol.volLock.RLock()
defer vol.volLock.RUnlock()
return vol.Status
}
func (vol *Vol) capacity() uint64 {
vol.volLock.RLock()
defer vol.volLock.RUnlock()
return vol.Capacity
}
func (vol *Vol) autoDeleteDp(c *Cluster) {
if vol.dataPartitions == nil {
return
}
maxSize := overSoldCap(vol.CacheCapacity * util.GB)
maxCnt := maxSize / vol.dataPartitionSize
if maxSize%vol.dataPartitionSize != 0 {
maxCnt++
}
partitions := vol.dataPartitions.clonePartitions()
for _, dp := range partitions {
if !proto.IsCacheDp(dp.PartitionType) {
continue
}
if maxCnt > 0 {
maxCnt--
continue
}
log.LogInfof("[autoDeleteDp] start delete dp, id[%d]", dp.PartitionID)
vol.deleteDataPartition(c, dp)
}
}
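// Worked example for autoDeleteDp above (hypothetical numbers): if the
// over-sold cache capacity comes out at 150 GB and dataPartitionSize is 120 GB,
// maxCnt is rounded up to 2, so the first two cache-type partitions are kept
// and every further one is deleted.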
func (vol *Vol) checkAutoDataPartitionCreation(c *Cluster) {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkAutoDataPartitionCreation occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkAutoDataPartitionCreation occurred panic")
}
}()
if ok, _ := vol.needCreateDataPartition(); !ok {
return
}
vol.setStatus(proto.VolStatusNormal)
log.LogInfof("action[autoCreateDataPartitions] vol[%v] before autoCreateDataPartitions", vol.Name)
if !c.DisableAutoAllocate && !vol.Forbidden {
vol.autoCreateDataPartitions(c)
}
}
func (vol *Vol) shouldInhibitWriteBySpaceFull() bool {
if !vol.DpReadOnlyWhenVolFull {
return false
}
if vol.capacity() == 0 {
return false
}
if !proto.IsHot(vol.VolType) {
return false
}
usedSpace := vol.totalUsedSpace() / util.GB
if usedSpace >= vol.capacity() {
return true
}
return false
}
func (vol *Vol) needCreateDataPartition() (ok bool, err error) {
ok = false
if vol.status() == proto.VolStatusMarkDelete {
err = proto.ErrVolNotExists
return
}
if vol.capacity() == 0 {
err = proto.ErrVolNoAvailableSpace
return
}
if proto.IsHot(vol.VolType) {
if vol.shouldInhibitWriteBySpaceFull() {
vol.setAllDataPartitionsToReadOnly()
err = proto.ErrVolNoAvailableSpace
return
}
ok = true
return
}
// cold
if vol.CacheAction == proto.NoCache && vol.CacheRule == "" {
err = proto.ErrVolNoCacheAndRule
return
}
ok = true
return
}
func (vol *Vol) autoCreateDataPartitions(c *Cluster) {
if time.Since(vol.dataPartitions.lastAutoCreateTime) < time.Minute {
return
}
if c.cfg.DisableAutoCreate {
// when auto-create is disabled, do not create new data partitions once the allocated size exceeds the volume capacity
allocSize := uint64(len(vol.dataPartitions.partitions)) * vol.dataPartitionSize
totalSize := vol.capacity() * util.GB
if allocSize > totalSize {
return
}
if vol.dataPartitions.readableAndWritableCnt < minNumOfRWDataPartitions {
c.batchCreateDataPartition(vol, minNumOfRWDataPartitions, false)
log.LogWarnf("autoCreateDataPartitions: readWrite less than 10, alloc new 10 partitions, vol %s", vol.Name)
}
return
}
if proto.IsCold(vol.VolType) {
vol.dataPartitions.lastAutoCreateTime = time.Now()
maxSize := overSoldCap(vol.CacheCapacity * util.GB)
allocSize := uint64(0)
for _, dp := range vol.cloneDataPartitionMap() {
if !proto.IsCacheDp(dp.PartitionType) {
continue
}
allocSize += dp.total
}
if maxSize <= allocSize {
log.LogInfof("action[autoCreateDataPartitions] (%s) no need to create again, alloc [%d], max [%d]", vol.Name, allocSize, maxSize)
return
}
count := (maxSize-allocSize-1)/vol.dataPartitionSize + 1 // ceil((maxSize-allocSize)/dataPartitionSize)
log.LogInfof("action[autoCreateDataPartitions] vol[%v] count[%v]", vol.Name, count)
c.batchCreateDataPartition(vol, int(count), false)
return
}
if (vol.Capacity > 200000 && vol.dataPartitions.readableAndWritableCnt < 200) || vol.dataPartitions.readableAndWritableCnt < minNumOfRWDataPartitions {
vol.dataPartitions.lastAutoCreateTime = time.Now()
count := vol.calculateExpansionNum()
log.LogInfof("action[autoCreateDataPartitions] vol[%v] count[%v]", vol.Name, count)
c.batchCreateDataPartition(vol, count, false)
}
}
// Calculate the expansion number (the number of data partitions to be allocated to the given volume)
func (vol *Vol) calculateExpansionNum() (count int) {
c := float64(vol.Capacity) * volExpansionRatio * float64(util.GB) / float64(util.DefaultDataPartitionSize)
switch {
case c < minNumOfRWDataPartitions:
count = minNumOfRWDataPartitions
case c > maxNumberOfDataPartitionsForExpansion:
count = maxNumberOfDataPartitionsForExpansion
default:
count = int(c)
}
return
}
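// Worked example for calculateExpansionNum above (symbolic, since
// volExpansionRatio and util.DefaultDataPartitionSize are defined elsewhere):
// c = Capacity(GB) * volExpansionRatio * GB / DefaultDataPartitionSize, and the
// result is clamped to [minNumOfRWDataPartitions, maxNumberOfDataPartitionsForExpansion].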
func (vol *Vol) setAllDataPartitionsToReadOnly() {
vol.dataPartitions.setAllDataPartitionsToReadOnly()
}
func (vol *Vol) totalUsedSpace() uint64 {
return vol.totalUsedSpaceByMeta(false)
}
func (vol *Vol) totalUsedSpaceByMeta(byMeta bool) uint64 {
if proto.IsCold(vol.VolType) || byMeta {
return vol.ebsUsedSpace()
}
return vol.cfsUsedSpace()
}
func (vol *Vol) cfsUsedSpace() uint64 {
return vol.dataPartitions.totalUsedSpace()
}
func (vol *Vol) sendViewCacheToFollower(c *Cluster) {
var err error
log.LogInfof("action[asyncSendPartitionsToFollower]")
metadata := new(RaftCmd)
metadata.Op = opSyncDataPartitionsView
metadata.K = vol.Name
metadata.V = vol.dataPartitions.getDataPartitionResponseCache()
if err = c.submit(metadata); err != nil {
log.LogErrorf("action[asyncSendPartitionsToFollower] error [%v]", err)
}
log.LogInfof("action[asyncSendPartitionsToFollower] finished")
}
func (vol *Vol) ebsUsedSpace() uint64 {
size := uint64(0)
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, pt := range vol.MetaPartitions {
size += pt.dataSize()
}
return size
}
func (vol *Vol) updateViewCache(c *Cluster) {
view := proto.NewVolView(vol.Name, vol.Status, vol.FollowerRead, vol.createTime, vol.CacheTTL, vol.VolType, vol.DeleteLockTime)
view.SetOwner(vol.Owner)
view.SetOSSSecure(vol.OSSAccessKey, vol.OSSSecretKey)
mpViews := vol.getMetaPartitionsView()
view.MetaPartitions = mpViews
mpViewsReply := newSuccessHTTPReply(mpViews)
mpsBody, err := json.Marshal(mpViewsReply)
if err != nil {
log.LogErrorf("action[updateViewCache] failed,vol[%v],err[%v]", vol.Name, err)
return
}
vol.setMpsCache(mpsBody)
// dpResps := vol.dataPartitions.getDataPartitionsView(0)
// view.DataPartitions = dpResps
view.DomainOn = vol.domainOn
viewReply := newSuccessHTTPReply(view)
body, err := json.Marshal(viewReply)
if err != nil {
log.LogErrorf("action[updateViewCache] failed,vol[%v],err[%v]", vol.Name, err)
return
}
vol.setViewCache(body)
}
func (vol *Vol) getMetaPartitionsView() (mpViews []*proto.MetaPartitionView) {
mps := make(map[uint64]*MetaPartition)
vol.mpsLock.RLock()
for key, mp := range vol.MetaPartitions {
mps[key] = mp
}
vol.mpsLock.RUnlock()
mpViews = make([]*proto.MetaPartitionView, 0)
for _, mp := range mps {
mpViews = append(mpViews, getMetaPartitionView(mp))
}
return
}
func (vol *Vol) setMpsCache(body []byte) {
vol.volLock.Lock()
defer vol.volLock.Unlock()
vol.mpsCache = body
}
func (vol *Vol) getMpsCache() []byte {
vol.volLock.RLock()
defer vol.volLock.RUnlock()
return vol.mpsCache
}
func (vol *Vol) setViewCache(body []byte) {
vol.volLock.Lock()
defer vol.volLock.Unlock()
vol.viewCache = body
}
func (vol *Vol) getViewCache() []byte {
vol.volLock.RLock()
defer vol.volLock.RUnlock()
return vol.viewCache
}
func (vol *Vol) deleteDataPartition(c *Cluster, dp *DataPartition) {
var addrs []string
for _, replica := range dp.Replicas {
addrs = append(addrs, replica.Addr)
}
for _, addr := range addrs {
if err := vol.deleteDataPartitionFromDataNode(c, dp.createTaskToDeleteDataPartition(addr)); err != nil {
log.LogErrorf("[deleteDataPartitionFromDataNode] delete data replica from datanode fail, id %d, err %s", dp.PartitionID, err.Error())
}
}
vol.dataPartitions.del(dp)
err := c.syncDeleteDataPartition(dp)
if err != nil {
log.LogErrorf("[deleteDataPartition] delete data partition from store fail, [%d], err: %s", dp.PartitionID, err.Error())
return
}
log.LogInfof("[deleteDataPartition] delete data partition success, [%d]", dp.PartitionID)
}
// Periodically check the volume's status.
// If a volume is marked as deleted, generate the corresponding delete tasks (for its meta partitions and data partitions).
// Once all the meta partitions and data partitions of the volume have been deleted, delete the volume itself.
func (vol *Vol) checkStatus(c *Cluster) {
if !atomic.CompareAndSwapInt32(&vol.VersionMgr.checkStatus, 0, 1) {
return
}
defer func() {
atomic.StoreInt32(&vol.VersionMgr.checkStatus, 0)
if r := recover(); r != nil {
log.LogWarnf("checkStatus occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkStatus occurred panic")
}
}()
vol.updateViewCache(c)
vol.volLock.Lock()
defer vol.volLock.Unlock()
if vol.Status != proto.VolStatusMarkDelete {
return
}
log.LogInfof("action[volCheckStatus] vol[%v],status[%v]", vol.Name, vol.Status)
metaTasks := vol.getTasksToDeleteMetaPartitions()
dataTasks := vol.getTasksToDeleteDataPartitions()
if len(metaTasks) == 0 && len(dataTasks) == 0 {
vol.deleteVolFromStore(c)
}
go func() {
for _, metaTask := range metaTasks {
vol.deleteMetaPartitionFromMetaNode(c, metaTask)
}
for _, dataTask := range dataTasks {
vol.deleteDataPartitionFromDataNode(c, dataTask)
}
}()
return
}
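// Flow summary for checkStatus above: once a volume is marked deleted, each
// pass issues delete tasks for every remaining meta/data replica via the two
// helpers below; only when both task lists come back empty is the volume
// itself removed from the store through deleteVolFromStore.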
func (vol *Vol) deleteMetaPartitionFromMetaNode(c *Cluster, task *proto.AdminTask) {
mp, err := vol.metaPartition(task.PartitionID)
if err != nil {
return
}
metaNode, err := c.metaNode(task.OperatorAddr)
if err != nil {
return
}
mp.RLock()
_, err = mp.getMetaReplica(task.OperatorAddr)
mp.RUnlock()
if err != nil {
log.LogWarnf("deleteMetaPartitionFromMetaNode (%s) maybe alread been deleted", task.ToString())
return
}
_, err = metaNode.Sender.syncSendAdminTask(task)
if err != nil {
log.LogErrorf("action[deleteMetaPartition] vol[%v],meta partition[%v],err[%v]", mp.volName, mp.PartitionID, err)
return
}
mp.Lock()
mp.removeReplicaByAddr(metaNode.Addr)
mp.removeMissingReplica(metaNode.Addr)
mp.Unlock()
return
}
func (vol *Vol) deleteDataPartitionFromDataNode(c *Cluster, task *proto.AdminTask) (err error) {
dp, err := vol.getDataPartitionByID(task.PartitionID)
if err != nil {
return
}
dataNode, err := c.dataNode(task.OperatorAddr)
if err != nil {
return
}
dp.RLock()
_, ok := dp.hasReplica(task.OperatorAddr)
dp.RUnlock()
if !ok {
log.LogWarnf("deleteDataPartitionFromDataNode task(%s) maybe already executed", task.ToString())
return
}
_, err = dataNode.TaskManager.syncSendAdminTask(task)
if err != nil {
log.LogErrorf("action[deleteDataReplica] vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
return
}
dp.Lock()
dp.removeReplicaByAddr(dataNode.Addr)
dp.checkAndRemoveMissReplica(dataNode.Addr)
if err = dp.update("deleteDataReplica", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
dp.Unlock()
return
}
dp.Unlock()
return
}
func (vol *Vol) deleteVolFromStore(c *Cluster) (err error) {
log.LogWarnf("deleteVolFromStore vol %v", vol.Name)
if err = c.syncDeleteVol(vol); err != nil {
return
}
// delete the metadata of the meta and data partitions first
vol.deleteDataPartitionsFromStore(c)
vol.deleteMetaPartitionsFromStore(c)
// then delete the volume
c.deleteVol(vol.Name)
c.volStatInfo.Delete(vol.Name)
c.DelBucketLifecycle(vol.Name)
return
}
func (vol *Vol) deleteMetaPartitionsFromStore(c *Cluster) {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, mp := range vol.MetaPartitions {
c.syncDeleteMetaPartition(mp)
}
return
}
func (vol *Vol) deleteDataPartitionsFromStore(c *Cluster) {
vol.dataPartitions.RLock()
defer vol.dataPartitions.RUnlock()
for _, dp := range vol.dataPartitions.partitions {
c.syncDeleteDataPartition(dp)
}
}
func (vol *Vol) getTasksToDeleteMetaPartitions() (tasks []*proto.AdminTask) {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
tasks = make([]*proto.AdminTask, 0)
for _, mp := range vol.MetaPartitions {
log.LogDebugf("get delete task from vol(%s) mp(%d)", vol.Name, mp.PartitionID)
for _, replica := range mp.Replicas {
log.LogDebugf("get delete task from vol(%s) mp(%d),replica(%v)", vol.Name, mp.PartitionID, replica.Addr)
tasks = append(tasks, replica.createTaskToDeleteReplica(mp.PartitionID))
}
}
return
}
func (vol *Vol) getTasksToDeleteDataPartitions() (tasks []*proto.AdminTask) {
tasks = make([]*proto.AdminTask, 0)
vol.dataPartitions.RLock()
defer vol.dataPartitions.RUnlock()
for _, dp := range vol.dataPartitions.partitions {
for _, replica := range dp.Replicas {
tasks = append(tasks, dp.createTaskToDeleteDataPartition(replica.Addr))
}
}
return
}
func (vol *Vol) getDataPartitionsCount() (count int) {
vol.volLock.RLock()
count = len(vol.dataPartitions.partitionMap)
vol.volLock.RUnlock()
return
}
func (vol *Vol) String() string {
return fmt.Sprintf("name[%v],dpNum[%v],mpNum[%v],cap[%v],status[%v]",
vol.Name, vol.dpReplicaNum, vol.mpReplicaNum, vol.Capacity, vol.Status)
}
func (vol *Vol) doSplitMetaPartition(c *Cluster, mp *MetaPartition, end uint64, metaPartitionInodeIdStep uint64, ignoreNoLeader bool) (nextMp *MetaPartition, err error) {
mp.Lock()
defer mp.Unlock()
if err = mp.canSplit(end, metaPartitionInodeIdStep, ignoreNoLeader); err != nil {
return
}
log.LogWarnf("action[splitMetaPartition],partition[%v],start[%v],end[%v],new end[%v]", mp.PartitionID, mp.Start, mp.End, end)
cmdMap := make(map[string]*RaftCmd, 0)
oldEnd := mp.End
mp.End = end
updateMpRaftCmd, err := c.buildMetaPartitionRaftCmd(opSyncUpdateMetaPartition, mp)
if err != nil {
return
}
cmdMap[updateMpRaftCmd.K] = updateMpRaftCmd
if nextMp, err = vol.doCreateMetaPartition(c, mp.End+1, defaultMaxMetaPartitionInodeID); err != nil {
Warn(c.Name, fmt.Sprintf("action[updateEnd] clusterID[%v] partitionID[%v] create meta partition err[%v]",
c.Name, mp.PartitionID, err))
log.LogErrorf("action[updateEnd] partitionID[%v] err[%v]", mp.PartitionID, err)
return
}
addMpRaftCmd, err := c.buildMetaPartitionRaftCmd(opSyncAddMetaPartition, nextMp)
if err != nil {
return
}
cmdMap[addMpRaftCmd.K] = addMpRaftCmd
if err = c.syncBatchCommitCmd(cmdMap); err != nil {
mp.End = oldEnd
return nil, errors.NewError(err)
}
mp.updateInodeIDRangeForAllReplicas()
mp.addUpdateMetaReplicaTask(c)
return
}
func (vol *Vol) splitMetaPartition(c *Cluster, mp *MetaPartition, end uint64, metaPartitionInodeIdStep uint64, ignoreNoLeader bool) (err error) {
if c.DisableAutoAllocate {
err = errors.NewErrorf("cluster auto allocate is disable")
return
}
if vol.Forbidden {
err = errors.NewErrorf("volume %v is forbidden", vol.Name)
return
}
vol.createMpMutex.Lock()
defer vol.createMpMutex.Unlock()
maxPartitionID := vol.maxPartitionID()
if maxPartitionID != mp.PartitionID {
err = fmt.Errorf("mp[%v] is not the last meta partition[%v]", mp.PartitionID, maxPartitionID)
return
}
nextMp, err := vol.doSplitMetaPartition(c, mp, end, metaPartitionInodeIdStep, ignoreNoLeader)
if err != nil {
return
}
vol.addMetaPartition(nextMp)
log.LogWarnf("action[splitMetaPartition],next partition[%v],start[%v],end[%v]", nextMp.PartitionID, nextMp.Start, nextMp.End)
return
}
func (vol *Vol) createMetaPartition(c *Cluster, start, end uint64) (err error) {
var mp *MetaPartition
if mp, err = vol.doCreateMetaPartition(c, start, end); err != nil {
return
}
if err = c.syncAddMetaPartition(mp); err != nil {
return errors.NewError(err)
}
vol.addMetaPartition(mp)
return
}
func (vol *Vol) doCreateMetaPartition(c *Cluster, start, end uint64) (mp *MetaPartition, err error) {
var (
hosts []string
partitionID uint64
peers []proto.Peer
wg sync.WaitGroup
)
errChannel := make(chan error, vol.mpReplicaNum)
if c.isFaultDomain(vol) {
if hosts, peers, err = c.getHostFromDomainZone(vol.domainId, TypeMetaPartition, vol.mpReplicaNum); err != nil {
log.LogErrorf("action[doCreateMetaPartition] getHostFromDomainZone err[%v]", err)
return nil, errors.NewError(err)
}
} else {
var excludeZone []string
zoneNum := c.decideZoneNum(vol.crossZone)
if hosts, peers, err = c.getHostFromNormalZone(TypeMetaPartition, excludeZone, nil, nil, int(vol.mpReplicaNum), zoneNum, vol.zoneName); err != nil {
log.LogErrorf("action[doCreateMetaPartition] getHostFromNormalZone err[%v]", err)
return nil, errors.NewError(err)
}
}
log.LogInfof("target meta hosts:%v,peers:%v", hosts, peers)
if partitionID, err = c.idAlloc.allocateMetaPartitionID(); err != nil {
return nil, errors.NewError(err)
}
mp = newMetaPartition(partitionID, start, end, vol.mpReplicaNum, vol.Name, vol.ID, vol.VersionMgr.getLatestVer())
mp.setHosts(hosts)
mp.setPeers(peers)
for _, host := range hosts {
wg.Add(1)
go func(host string) {
defer wg.Done()
// use a goroutine-local err to avoid racing on the outer err; failures are reported via errChannel
if err := c.syncCreateMetaPartitionToMetaNode(host, mp); err != nil {
errChannel <- err
return
}
mp.Lock()
defer mp.Unlock()
if err := mp.afterCreation(host, c); err != nil {
errChannel <- err
}
}(host)
}
wg.Wait()
select {
case err = <-errChannel:
for _, host := range hosts {
wg.Add(1)
go func(host string) {
defer wg.Done()
mr, err := mp.getMetaReplica(host)
if err != nil {
return
}
task := mr.createTaskToDeleteReplica(mp.PartitionID)
tasks := make([]*proto.AdminTask, 0)
tasks = append(tasks, task)
c.addMetaNodeTasks(tasks)
}(host)
}
wg.Wait()
return nil, errors.NewError(err)
default:
mp.Status = proto.ReadWrite
}
log.LogInfof("action[doCreateMetaPartition] success,volName[%v],partition[%v],start[%v],end[%v]", vol.Name, partitionID, start, end)
return
}
func setVolFromArgs(args *VolVarargs, vol *Vol) {
vol.zoneName = args.zoneName
vol.Capacity = args.capacity
vol.DeleteLockTime = args.deleteLockTime
vol.FollowerRead = args.followerRead
vol.authenticate = args.authenticate
vol.enablePosixAcl = args.enablePosixAcl
vol.DpReadOnlyWhenVolFull = args.dpReadOnlyWhenVolFull
vol.enableQuota = args.enableQuota
vol.enableTransaction = args.enableTransaction
vol.txTimeout = args.txTimeout
vol.txConflictRetryNum = args.txConflictRetryNum
vol.txConflictRetryInterval = args.txConflictRetryInterval
vol.txOpLimit = args.txOpLimit
vol.dpReplicaNum = args.dpReplicaNum
if proto.IsCold(vol.VolType) {
coldArgs := args.coldArgs
vol.CacheLRUInterval = coldArgs.cacheLRUInterval
vol.CacheLowWater = coldArgs.cacheLowWater
vol.CacheHighWater = coldArgs.cacheHighWater
vol.CacheTTL = coldArgs.cacheTtl
vol.CacheThreshold = coldArgs.cacheThreshold
vol.CacheAction = coldArgs.cacheAction
vol.CacheRule = coldArgs.cacheRule
vol.CacheCapacity = coldArgs.cacheCap
vol.EbsBlkSize = coldArgs.objBlockSize
}
vol.description = args.description
vol.dpSelectorName = args.dpSelectorName
vol.dpSelectorParm = args.dpSelectorParm
}
func getVolVarargs(vol *Vol) *VolVarargs {
args := &coldVolArgs{
objBlockSize: vol.EbsBlkSize,
cacheCap: vol.CacheCapacity,
cacheAction: vol.CacheAction,
cacheThreshold: vol.CacheThreshold,
cacheTtl: vol.CacheTTL,
cacheHighWater: vol.CacheHighWater,
cacheLowWater: vol.CacheLowWater,
cacheLRUInterval: vol.CacheLRUInterval,
cacheRule: vol.CacheRule,
}
return &VolVarargs{
zoneName: vol.zoneName,
description: vol.description,
capacity: vol.Capacity,
deleteLockTime: vol.DeleteLockTime,
followerRead: vol.FollowerRead,
authenticate: vol.authenticate,
dpSelectorName: vol.dpSelectorName,
dpSelectorParm: vol.dpSelectorParm,
enablePosixAcl: vol.enablePosixAcl,
enableQuota: vol.enableQuota,
dpReplicaNum: vol.dpReplicaNum,
enableTransaction: vol.enableTransaction,
txTimeout: vol.txTimeout,
txConflictRetryNum: vol.txConflictRetryNum,
txConflictRetryInterval: vol.txConflictRetryInterval,
txOpLimit: vol.txOpLimit,
coldArgs: args,
dpReadOnlyWhenVolFull: vol.DpReadOnlyWhenVolFull,
}
}
func (vol *Vol) initQuotaManager(c *Cluster) {
vol.quotaManager = &MasterQuotaManager{
MpQuotaInfoMap: make(map[uint64][]*proto.QuotaReportInfo),
IdQuotaInfoMap: make(map[uint32]*proto.QuotaInfo),
c: c,
vol: vol,
}
}
func (vol *Vol) loadQuotaManager(c *Cluster) (err error) {
vol.quotaManager = &MasterQuotaManager{
MpQuotaInfoMap: make(map[uint64][]*proto.QuotaReportInfo),
IdQuotaInfoMap: make(map[uint32]*proto.QuotaInfo),
c: c,
vol: vol,
}
result, err := c.fsm.store.SeekForPrefix([]byte(quotaPrefix + strconv.FormatUint(vol.ID, 10) + keySeparator))
if err != nil {
err = fmt.Errorf("loadQuotaManager get quota failed, err [%v]", err)
return err
}
for _, value := range result {
quotaInfo := &proto.QuotaInfo{}
if err = json.Unmarshal(value, quotaInfo); err != nil {
log.LogErrorf("loadQuotaManager Unmarshal fail err [%v]", err)
return err
}
log.LogDebugf("loadQuotaManager info [%v]", quotaInfo)
if vol.Name != quotaInfo.VolName {
panic(fmt.Sprintf("vol name do not match vol name [%v], quotaInfo vol name [%v]", vol.Name, quotaInfo.VolName))
}
vol.quotaManager.IdQuotaInfoMap[quotaInfo.QuotaId] = quotaInfo
}
return err
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"os"
"path"
"strconv"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// APIResponse defines the structure of the response to an HTTP request
type APIResponse struct {
Code int `json:"code"`
Msg string `json:"msg"`
Data interface{} `json:"data,omitempty"`
}
// NewAPIResponse returns a new API response.
func NewAPIResponse(code int, msg string) *APIResponse {
return &APIResponse{
Code: code,
Msg: msg,
}
}
// Marshal is a wrapper function of json.Marshal
func (api *APIResponse) Marshal() ([]byte, error) {
return json.Marshal(api)
}
// register the APIs
func (m *MetaNode) registerAPIHandler() (err error) {
http.HandleFunc("/getPartitions", m.getPartitionsHandler)
http.HandleFunc("/getPartitionById", m.getPartitionByIDHandler)
http.HandleFunc("/getLeaderPartitions", m.getLeaderPartitionsHandler)
http.HandleFunc("/getInode", m.getInodeHandler)
http.HandleFunc("/getSplitKey", m.getSplitKeyHandler)
http.HandleFunc("/getExtentsByInode", m.getExtentsByInodeHandler)
http.HandleFunc("/getEbsExtentsByInode", m.getEbsExtentsByInodeHandler)
// get all inodes of the partitionID
http.HandleFunc("/getAllInodes", m.getAllInodesHandler)
// get dentry information
http.HandleFunc("/getDentry", m.getDentryHandler)
http.HandleFunc("/getDirectory", m.getDirectoryHandler)
http.HandleFunc("/getAllDentry", m.getAllDentriesHandler)
http.HandleFunc("/getAllTxInfo", m.getAllTxHandler)
http.HandleFunc("/getParams", m.getParamsHandler)
http.HandleFunc("/getSmuxStat", m.getSmuxStatHandler)
http.HandleFunc("/getRaftStatus", m.getRaftStatusHandler)
http.HandleFunc("/genClusterVersionFile", m.genClusterVersionFileHandler)
http.HandleFunc("/getInodeSnapshot", m.getInodeSnapshotHandler)
http.HandleFunc("/getDentrySnapshot", m.getDentrySnapshotHandler)
// get tx information
http.HandleFunc("/getTx", m.getTxHandler)
return
}
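// exampleQueryPartitionByID is an illustrative sketch (not part of the original
// code) of how a client could call one of the handlers registered above and
// decode the generic APIResponse envelope; the listen address below is a
// placeholder assumption, not the node's real profiling port.
func exampleQueryPartitionByID() {
resp, err := http.Get("http://127.0.0.1:17220/getPartitionById?pid=1") // placeholder address
if err != nil {
log.LogErrorf("[exampleQueryPartitionByID] request failed: %v", err)
return
}
defer resp.Body.Close()
reply := &APIResponse{}
if err := json.NewDecoder(resp.Body).Decode(reply); err != nil {
log.LogErrorf("[exampleQueryPartitionByID] decode failed: %v", err)
return
}
log.LogInfof("[exampleQueryPartitionByID] code(%v) msg(%v) data(%v)", reply.Code, reply.Msg, reply.Data)
}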
func (m *MetaNode) getParamsHandler(w http.ResponseWriter,
r *http.Request) {
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
params := make(map[string]interface{})
params[metaNodeDeleteBatchCountKey] = DeleteBatchCount()
resp.Data = params
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getPartitionsHandler] response %s", err)
}
}
func (m *MetaNode) getSmuxStatHandler(w http.ResponseWriter,
r *http.Request) {
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
resp.Data = smuxPool.GetStat()
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getSmuxStatHandler] response %s", err)
}
}
func (m *MetaNode) getPartitionsHandler(w http.ResponseWriter,
r *http.Request) {
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
resp.Data = m.metadataManager
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getPartitionsHandler] response %s", err)
}
}
func (m *MetaNode) getPartitionByIDHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getPartitionByIDHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
msg := make(map[string]interface{})
leader, _ := mp.IsLeader()
_, leaderTerm := mp.LeaderTerm()
msg["leaderAddr"] = leader
msg["leader_term"] = leaderTerm
conf := mp.GetBaseConfig()
msg["partition_id"] = conf.PartitionId
msg["partition_type"] = conf.PartitionType
msg["vol_name"] = conf.VolName
msg["start"] = conf.Start
msg["end"] = conf.End
msg["peers"] = conf.Peers
msg["nodeId"] = conf.NodeId
msg["cursor"] = conf.Cursor
resp.Data = msg
resp.Code = http.StatusOK
resp.Msg = http.StatusText(http.StatusOK)
}
func (m *MetaNode) getLeaderPartitionsHandler(w http.ResponseWriter, r *http.Request) {
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
mps := m.metadataManager.GetLeaderPartitions()
resp.Data = mps
data, err := resp.Marshal()
if err != nil {
log.LogErrorf("json marshal error:%v", err)
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getPartitionsHandler] response %s", err)
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
}
}
func (m *MetaNode) getAllInodesHandler(w http.ResponseWriter, r *http.Request) {
var err error
defer func() {
if err != nil {
msg := fmt.Sprintf("[getAllInodesHandler] err(%v)", err)
if _, e := w.Write([]byte(msg)); e != nil {
log.LogErrorf("[getAllInodesHandler] failed to write response: err(%v) msg(%v)", e, msg)
}
}
}()
if err = r.ParseForm(); err != nil {
return
}
id, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
return
}
mp, err := m.metadataManager.GetPartition(id)
if err != nil {
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
return
}
var inode *Inode
f := func(i BtreeItem) bool {
var (
data []byte
e error
)
if inode != nil {
if _, e = w.Write([]byte("\n")); e != nil {
log.LogErrorf("[getAllInodesHandler] failed to write response: %v", e)
return false
}
}
inode, _ = i.(*Inode).getInoByVer(verSeq, false)
if inode == nil {
return true
}
if data, e = inode.MarshalToJSON(); e != nil {
log.LogErrorf("[getAllInodesHandler] failed to marshal to json: %v", e)
return false
}
if _, e = w.Write(data); e != nil {
log.LogErrorf("[getAllInodesHandler] failed to write response: %v", e)
return false
}
return true
}
mp.GetInodeTree().Ascend(f)
}
func (m *MetaNode) getSplitKeyHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
log.LogDebugf("getSplitKeyHandler")
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getSplitKeyHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
id, err := strconv.ParseUint(r.FormValue("ino"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
verAll, _ := strconv.ParseBool(r.FormValue("verAll"))
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
req := &InodeGetSplitReq{
PartitionID: pid,
Inode: id,
VerSeq: verSeq,
VerAll: verAll,
}
log.LogDebugf("getSplitKeyHandler")
p := &Packet{}
err = mp.InodeGetSplitEk(req, p)
if err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
log.LogDebugf("getSplitKeyHandler data %v", resp.Data)
} else {
log.LogDebugf("getSplitKeyHandler")
}
return
}
func (m *MetaNode) getInodeHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getInodeHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
id, err := strconv.ParseUint(r.FormValue("ino"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
verAll, _ := strconv.ParseBool(r.FormValue("verAll"))
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &InodeGetReq{
PartitionID: pid,
Inode: id,
VerSeq: verSeq,
VerAll: verAll,
}
p := &Packet{}
err = mp.InodeGet(req, p)
if err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) getRaftStatusHandler(w http.ResponseWriter, r *http.Request) {
const (
paramRaftID = "id"
)
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getRaftStatusHandler] response %s", err)
}
}()
raftID, err := strconv.ParseUint(r.FormValue(paramRaftID), 10, 64)
if err != nil {
err = fmt.Errorf("parse param %v fail: %v", paramRaftID, err)
resp.Msg = err.Error()
resp.Code = http.StatusBadRequest
return
}
raftStatus := m.raftStore.RaftStatus(raftID)
resp.Data = raftStatus
}
func (m *MetaNode) getEbsExtentsByInodeHandler(w http.ResponseWriter,
r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getEbsExtentsByInodeHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
id, err := strconv.ParseUint(r.FormValue("ino"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &proto.GetExtentsRequest{
PartitionID: pid,
Inode: id,
}
p := &Packet{}
if err = mp.ObjExtentsList(req, p); err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) getExtentsByInodeHandler(w http.ResponseWriter,
r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getExtentsByInodeHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
id, err := strconv.ParseUint(r.FormValue("ino"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
verAll, _ := strconv.ParseBool(r.FormValue("verAll"))
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &proto.GetExtentsRequest{
PartitionID: pid,
Inode: id,
VerSeq: uint64(verSeq),
VerAll: verAll,
}
p := &Packet{}
if err = mp.ExtentsList(req, p); err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) getDentryHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
name := r.FormValue("name")
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getDentryHandler] response %s", err)
}
}()
var (
pid uint64
pIno uint64
err error
)
if pid, err = strconv.ParseUint(r.FormValue("pid"), 10, 64); err == nil {
pIno, err = strconv.ParseUint(r.FormValue("parentIno"), 10, 64)
}
if err != nil {
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
verAll, _ := strconv.ParseBool(r.FormValue("verAll"))
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &LookupReq{
PartitionID: pid,
ParentID: pIno,
Name: name,
VerSeq: verSeq,
VerAll: verAll,
}
p := &Packet{}
if err = mp.Lookup(req, p); err != nil {
resp.Code = http.StatusSeeOther
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) getTxHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getTxHandler] response %s", err)
}
}()
var (
pid uint64
txId string
err error
)
if pid, err = strconv.ParseUint(r.FormValue("pid"), 10, 64); err == nil {
if txId = r.FormValue("txId"); txId == "" {
err = fmt.Errorf("no txId")
}
}
if err != nil {
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &proto.TxGetInfoRequest{
Pid: pid,
TxID: txId,
}
p := &Packet{}
if err = mp.TxGetInfo(req, p); err != nil {
resp.Code = http.StatusSeeOther
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
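// getRealVerSeq parses the optional "verSeq" form value. An explicit verSeq of 0 is mapped to math.MaxUint64,
// which denotes the initial (oldest) snapshot version; if the parameter is absent, 0 (the latest version) is returned.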
func (m *MetaNode) getRealVerSeq(w http.ResponseWriter, r *http.Request) (verSeq uint64, err error) {
if r.FormValue("verSeq") != "" {
var ver int64
if ver, err = strconv.ParseInt(r.FormValue("verSeq"), 10, 64); err != nil {
return
}
verSeq = uint64(ver)
if verSeq == 0 {
verSeq = math.MaxUint64
}
}
return
}
func (m *MetaNode) getAllDentriesHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusSeeOther, "")
shouldSkip := false
defer func() {
if !shouldSkip {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getAllDentriesHandler] response %s", err)
}
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Code = http.StatusBadRequest
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
buff := bytes.NewBufferString(`{"code": 200, "msg": "OK", "data":[`)
if _, err := w.Write(buff.Bytes()); err != nil {
return
}
buff.Reset()
var (
val []byte
delimiter = []byte{',', '\n'}
isFirst = true
)
mp.GetDentryTree().Ascend(func(i BtreeItem) bool {
den, _ := i.(*Dentry).getDentryFromVerList(verSeq, false)
if den == nil || den.isDeleted() {
return true
}
if !isFirst {
if _, err = w.Write(delimiter); err != nil {
return false
}
} else {
isFirst = false
}
val, err = json.Marshal(den)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
w.Write([]byte(err.Error()))
return false
}
if _, err = w.Write(val); err != nil {
return false
}
return true
})
shouldSkip = true
buff.WriteString(`]}`)
if _, err = w.Write(buff.Bytes()); err != nil {
log.LogErrorf("[getAllDentriesHandler] response %s", err)
}
return
}
func (m *MetaNode) getAllTxHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusOK, "")
shouldSkip := false
defer func() {
if !shouldSkip {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getAllTxHandler] response %s", err)
}
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Code = http.StatusBadRequest
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
buff := bytes.NewBufferString(`{"code": 200, "msg": "OK", "data":[`)
if _, err := w.Write(buff.Bytes()); err != nil {
return
}
buff.Reset()
var (
val []byte
delimiter = []byte{',', '\n'}
isFirst = true
)
f := func(i BtreeItem) bool {
if !isFirst {
if _, err = w.Write(delimiter); err != nil {
return false
}
} else {
isFirst = false
}
if ino, ok := i.(*TxRollbackInode); ok {
_, err = w.Write([]byte(ino.ToString()))
if err != nil {
return false
}
return true
}
if den, ok := i.(*TxRollbackDentry); ok {
_, err = w.Write([]byte(den.ToString()))
if err != nil {
return false
}
return true
}
val, err = json.Marshal(i)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
w.Write([]byte(err.Error()))
return false
}
if _, err = w.Write(val); err != nil {
return false
}
return true
}
txTree, rbInoTree, rbDenTree := mp.TxGetTree()
txTree.Ascend(f)
rbInoTree.Ascend(f)
rbDenTree.Ascend(f)
shouldSkip = true
buff.WriteString(`]}`)
if _, err = w.Write(buff.Bytes()); err != nil {
log.LogErrorf("[getAllTxHandler] response %s", err)
}
return
}
func (m *MetaNode) getDirectoryHandler(w http.ResponseWriter, r *http.Request) {
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getDirectoryHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
pIno, err := strconv.ParseUint(r.FormValue("parentIno"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := ReadDirReq{
ParentID: pIno,
VerSeq: verSeq,
}
p := &Packet{}
if err = mp.ReadDir(&req, p); err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) genClusterVersionFileHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusOK, "Generate cluster version file success")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[genClusterVersionFileHandler] response %s", err)
}
}()
paths := make([]string, 0)
paths = append(paths, m.metadataDir, m.raftDir)
for _, p := range paths {
if _, err := os.Stat(path.Join(p, config.ClusterVersionFile)); err == nil || os.IsExist(err) {
resp.Code = http.StatusCreated
resp.Msg = "Cluster version file already exists in " + p
return
}
}
for _, p := range paths {
if err := config.CheckOrStoreClusterUuid(p, m.clusterUuid, true); err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = "Failed to create cluster version file in " + p
return
}
}
return
}
func (m *MetaNode) getInodeSnapshotHandler(w http.ResponseWriter, r *http.Request) {
m.getSnapshotHandler(w, r, inodeFile)
}
func (m *MetaNode) getDentrySnapshotHandler(w http.ResponseWriter, r *http.Request) {
m.getSnapshotHandler(w, r, dentryFile)
}
func (m *MetaNode) getSnapshotHandler(w http.ResponseWriter, r *http.Request, file string) {
var err error
defer func() {
if err != nil {
msg := fmt.Sprintf("[getInodeSnapshotHandler] err(%v)", err)
log.LogErrorf("%s", msg)
if _, e := w.Write([]byte(msg)); e != nil {
log.LogErrorf("[getInodeSnapshotHandler] failed to write response: err(%v) msg(%v)", e, msg)
}
}
}()
if err = r.ParseForm(); err != nil {
return
}
id, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
return
}
mp, err := m.metadataManager.GetPartition(id)
if err != nil {
return
}
filename := path.Join(mp.GetBaseConfig().RootDir, snapshotDir, file)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[getInodeSnapshotHandler] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[getInodeSnapshotHandler] OpenFile: %s", err.Error())
return
}
defer fp.Close()
_, err = io.Copy(w, fp)
if err != nil {
err = errors.NewErrorf("[getInodeSnapshotHandler] copy: %s", err.Error())
return
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"sync"
"github.com/cubefs/cubefs/util/btree"
)
const defaultBTreeDegree = 32
type (
// BtreeItem type alias google btree Item
BtreeItem = btree.Item
)
// BTree is the wrapper of Google's btree.
type BTree struct {
sync.RWMutex
tree *btree.BTree
}
// NewBtree creates a new btree.
func NewBtree() *BTree {
return &BTree{
tree: btree.New(defaultBTreeDegree),
}
}
// Get returns the object of the given key in the btree.
func (b *BTree) Get(key BtreeItem) (item BtreeItem) {
b.RLock()
item = b.tree.Get(key)
b.RUnlock()
return
}
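// CopyGet returns the item of the given key via the underlying btree's CopyGet, holding the write lock instead of the read lock.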
func (b *BTree) CopyGet(key BtreeItem) (item BtreeItem) {
b.Lock()
item = b.tree.CopyGet(key)
b.Unlock()
return
}
// Find searches for the given key in the btree.
func (b *BTree) Find(key BtreeItem, fn func(i BtreeItem)) {
b.RLock()
item := b.tree.Get(key)
b.RUnlock()
if item == nil {
return
}
fn(item)
}
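// CopyFind is like Find but holds the write lock and looks the item up with the underlying btree's CopyGet before invoking fn.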
func (b *BTree) CopyFind(key BtreeItem, fn func(i BtreeItem)) {
b.Lock()
item := b.tree.CopyGet(key)
fn(item)
b.Unlock()
}
// Has checks if the key exists in the btree.
func (b *BTree) Has(key BtreeItem) (ok bool) {
b.RLock()
ok = b.tree.Has(key)
b.RUnlock()
return
}
// Delete deletes the object by the given key.
func (b *BTree) Delete(key BtreeItem) (item BtreeItem) {
b.Lock()
item = b.tree.Delete(key)
b.Unlock()
return
}
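// Execute runs fn against the underlying btree while holding the write lock and returns fn's result.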
func (b *BTree) Execute(fn func(tree *btree.BTree) interface{}) interface{} {
b.Lock()
defer b.Unlock()
return fn(b.tree)
}
// ReplaceOrInsert is the wrapper of google's btree ReplaceOrInsert.
func (b *BTree) ReplaceOrInsert(key BtreeItem, replace bool) (item BtreeItem, ok bool) {
b.Lock()
if replace {
item = b.tree.ReplaceOrInsert(key)
b.Unlock()
ok = true
return
}
item = b.tree.Get(key)
if item == nil {
item = b.tree.ReplaceOrInsert(key)
b.Unlock()
ok = true
return
}
ok = false
b.Unlock()
return
}
// Ascend is the wrapper of the google's btree Ascend.
// This function scans the entire btree; when the data is huge, it is not recommended to use it online.
// Instead, call GetTree to obtain a snapshot of the current btree and then scan the snapshot (see the sketch after this function).
func (b *BTree) Ascend(fn func(i BtreeItem) bool) {
b.RLock()
b.tree.Ascend(fn)
b.RUnlock()
}
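// exampleBTreeSnapshotScan is a minimal illustrative sketch (not part of the
// original source) of the pattern recommended above: clone the tree with
// GetTree and iterate over the snapshot, so the live tree is not read-locked
// for the whole scan. The inode IDs used here are arbitrary.
func exampleBTreeSnapshotScan() int {
t := NewBtree()
for ino := uint64(1); ino <= 3; ino++ {
t.ReplaceOrInsert(NewInode(ino, 0), true)
}
// GetTree clones the underlying btree (copy-on-write), so scanning the
// snapshot does not block concurrent writers of the live tree.
snap := t.GetTree()
count := 0
snap.Ascend(func(i BtreeItem) bool {
count++
return true
})
return count // 3
}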
// AscendRange is the wrapper of the google's btree AscendRange.
func (b *BTree) AscendRange(greaterOrEqual, lessThan BtreeItem, iterator func(i BtreeItem) bool) {
b.RLock()
b.tree.AscendRange(greaterOrEqual, lessThan, iterator)
b.RUnlock()
}
// AscendGreaterOrEqual is the wrapper of the google's btree AscendGreaterOrEqual
func (b *BTree) AscendGreaterOrEqual(pivot BtreeItem, iterator func(i BtreeItem) bool) {
b.RLock()
b.tree.AscendGreaterOrEqual(pivot, iterator)
b.RUnlock()
}
// GetTree returns the snapshot of a btree.
func (b *BTree) GetTree() *BTree {
b.Lock()
t := b.tree.Clone()
b.Unlock()
nb := NewBtree()
nb.tree = t
return nb
}
// Reset resets the current btree.
func (b *BTree) Reset() {
b.Lock()
b.tree.Clear(true)
b.Unlock()
}
// Len returns the total number of items in the btree.
func (b *BTree) Len() (size int) {
b.RLock()
size = b.tree.Len()
b.RUnlock()
return
}
// MaxItem returns the largest item in the btree.
func (b *BTree) MaxItem() BtreeItem {
b.RLock()
item := b.tree.Max()
b.RUnlock()
return item
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"strings"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// DataPartition defines the struct of data partition that will be used on the meta node.
type DataPartition struct {
PartitionID uint64
Status int8
ReplicaNum uint8
PartitionType string
Hosts []string
IsDiscard bool
}
// GetAllAddrs returns the addresses of all hosts of the data partition except the first one, joined by proto.AddrSplit.
func (dp *DataPartition) GetAllAddrs() (m string) {
return strings.Join(dp.Hosts[1:], proto.AddrSplit) + proto.AddrSplit
}
// DataPartitionsView defines the view of the data partitions.
type DataPartitionsView struct {
DataPartitions []*DataPartition
}
func NewDataPartitionsView() *DataPartitionsView {
return &DataPartitionsView{}
}
// Vol defines the view of the data partitions of a volume, protected by a read/write lock.
type Vol struct {
sync.RWMutex
dataPartitionView map[uint64]*DataPartition
volDeleteLockTime int64
}
// NewVol returns a new volume instance.
func NewVol() *Vol {
return &Vol{
dataPartitionView: make(map[uint64]*DataPartition),
}
}
// GetPartition returns the data partition based on the given partition ID.
func (v *Vol) GetPartition(partitionID uint64) *DataPartition {
v.RLock()
defer v.RUnlock()
return v.dataPartitionView[partitionID]
}
// UpdatePartitions updates the data partition.
func (v *Vol) UpdatePartitions(partitions *DataPartitionsView) {
for _, dp := range partitions.DataPartitions {
log.LogDebugf("action[UpdatePartitions] dp (id:%v,status:%v)", dp.PartitionID, dp.Status)
v.replaceOrInsert(dp)
}
}
func (v *Vol) replaceOrInsert(partition *DataPartition) {
v.Lock()
defer v.Unlock()
v.dataPartitionView[partition.PartitionID] = partition
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"fmt"
"math"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// Dentry wraps the necessary properties of the `dentry` information in the file system.
// Marshal key:
// +-------+----------+------+
// | item | ParentId | Name |
// +-------+----------+------+
// | bytes | 8 | rest |
// +-------+----------+------+
// Marshal value:
// +-------+-------+------+
// | item | Inode | Type |
// +-------+-------+------+
// | bytes | 8 | 4 |
// +-------+-------+------+
// Marshal entity:
// +-------+-----------+--------------+-----------+--------------+
// | item | KeyLength | MarshaledKey | ValLength | MarshaledVal |
// +-------+-----------+--------------+-----------+--------------+
// | bytes | 4 | KeyLength | 4 | ValLength |
// +-------+-----------+--------------+-----------+--------------+
type DentryMultiSnap struct {
VerSeq uint64
dentryList DentryBatch
}
type Dentry struct {
ParentId uint64 // FileID value of the parent inode.
Name string // Name of the current dentry.
Inode uint64 // FileID value of the current inode.
Type uint32
// snapshot
multiSnap *DentryMultiSnap
}
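// NewDentrySnap returns a DentryMultiSnap initialized with the given version sequence.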
func NewDentrySnap(seq uint64) *DentryMultiSnap {
return &DentryMultiSnap{
VerSeq: seq,
}
}
func (d *Dentry) getSnapListLen() int {
if d.multiSnap == nil {
return 0
}
return len(d.multiSnap.dentryList)
}
func (d *Dentry) addVersion(ver uint64) {
dn := d.CopyDirectly().(*Dentry)
dn.setVerSeq(d.getSeqFiled())
d.setVerSeq(ver)
d.multiSnap.dentryList = append([]*Dentry{dn}, d.multiSnap.dentryList...)
}
func (d *Dentry) setVerSeq(verSeq uint64) {
if verSeq == 0 {
return
}
if d.multiSnap == nil {
d.multiSnap = NewDentrySnap(verSeq)
} else {
d.multiSnap.VerSeq = verSeq
}
}
func (d *Dentry) getSeqFiled() (verSeq uint64) {
if d.multiSnap == nil {
return 0
}
return d.multiSnap.VerSeq
}
func isSeqEqual(ver_1 uint64, ver_2 uint64) bool {
if isInitSnapVer(ver_1) {
ver_1 = 0
}
if isInitSnapVer(ver_2) {
ver_2 = 0
}
return (ver_1 & math.MaxInt64) == (ver_2 & math.MaxInt64)
}
func (d *Dentry) getVerSeq() (verSeq uint64) {
if d.multiSnap == nil {
return 0
}
return d.multiSnap.VerSeq & math.MaxInt64
}
func (d *Dentry) isDeleted() bool {
if d.multiSnap == nil {
return false
}
return (d.multiSnap.VerSeq >> 63) != 0
}
func (d *Dentry) setDeleted() {
if d.multiSnap == nil {
log.LogErrorf("action[setDeleted] d %v be set deleted not found multiSnap", d)
return
}
log.LogDebugf("action[setDeleted] d %v be set deleted", d)
d.multiSnap.VerSeq |= uint64(1) << 63
}
func (d *Dentry) minimizeSeq() (verSeq uint64) {
cnt := d.getSnapListLen()
if cnt == 0 {
return d.getVerSeq()
}
return d.multiSnap.dentryList[cnt-1].getVerSeq()
}
func (d *Dentry) isEffective(verSeq uint64) bool {
if verSeq == 0 {
return false
}
if isInitSnapVer(verSeq) {
verSeq = 0
}
return verSeq >= d.minimizeSeq()
}
// getDentryFromVerList returns the dentry of exactly the requested version when isHit is true;
// otherwise it returns the newest dentry visible at verSeq.
func (d *Dentry) getDentryFromVerList(verSeq uint64, isHit bool) (den *Dentry, idx int) {
if verSeq == 0 || (verSeq >= d.getVerSeq() && !isInitSnapVer(verSeq)) {
if d.isDeleted() {
log.LogDebugf("action[getDentryFromVerList] tmp dentry %v, is deleted, seq [%v]", d, d.getVerSeq())
return
}
return d, 0
}
// Read the oldest snapshot: an oldest version of 0 must be distinguished from reading the latest uncommitted version (which also uses seq 0).
if isInitSnapVer(verSeq) {
if d.getVerSeq() == 0 {
return d, 0
}
denListLen := d.getSnapListLen()
if denListLen == 0 {
return
}
den = d.multiSnap.dentryList[denListLen-1]
if d.multiSnap.dentryList[denListLen-1].getVerSeq() != 0 || d.multiSnap.dentryList[denListLen-1].isDeleted() {
return nil, 0
}
return den, denListLen
}
if d.multiSnap == nil {
return
}
for id, lDen := range d.multiSnap.dentryList {
if verSeq < lDen.getVerSeq() {
log.LogDebugf("action[getDentryFromVerList] den in ver list %v, return nil, request seq [%v], history ver seq [%v]", lDen, verSeq, lDen.getVerSeq())
} else {
if lDen.isDeleted() {
log.LogDebugf("action[getDentryFromVerList] den in ver list %v, return nil due to latest is deleted", lDen)
return
}
if isHit && lDen.getVerSeq() != verSeq {
log.LogDebugf("action[getDentryFromVerList] den in ver list %v, return nil due to ver not equal %v vs %v", lDen, lDen.getVerSeq(), verSeq)
return
}
return lDen, id + 1
}
}
log.LogDebugf("action[getDentryFromVerList] den in ver list not found right dentry with seq [%v]", verSeq)
return
}
func (d *Dentry) getLastestVer(reqVerSeq uint64, commit bool, verlist []*proto.VolVersionInfo) (uint64, bool) {
if len(verlist) == 0 {
return 0, false
}
for id, info := range verlist {
if commit && id == len(verlist)-1 {
break
}
if info.Ver >= reqVerSeq { // include reqSeq itself
return info.Ver, true
}
}
log.LogDebugf("action[getLastestVer] inode[%v] reqVerseq [%v] not found, the largetst one %v",
d.Inode, reqVerSeq, verlist[len(verlist)-1].Ver)
return 0, false
}
func (d *Dentry) deleteTopLayer(mpVerSeq uint64) (rd *Dentry, dmore bool, clean bool) {
if d.isDeleted() {
log.LogDebugf("action[deleteTopLayer.delSeq_0] do noting dentry %v seq 0 be deleted before", d)
return nil, false, false
}
// If the dentry has no snapshot of its own and no snapshot was taken after the dentry's version, unlink it directly and create no snapshot;
// the operation just happens on the top layer and the requested snapshot is dropped.
if d.getSnapListLen() == 0 {
if d.getVerSeq() == mpVerSeq {
// operate dentry directly
log.LogDebugf("action[deleteTopLayer.delSeq_0] no snapshot depend on this dentry,could drop seq 0 dentry %v", d)
return d, true, true
}
}
if d.getVerSeq() < mpVerSeq {
dn := d.CopyDirectly()
dn.(*Dentry).setVerSeq(d.getVerSeq())
d.setVerSeq(mpVerSeq)
d.multiSnap.dentryList = append([]*Dentry{dn.(*Dentry)}, d.multiSnap.dentryList...)
log.LogDebugf("action[deleteTopLayer.delSeq_0] create version and push to dentry list. dentry %v", dn.(*Dentry))
} else {
d.setVerSeq(mpVerSeq)
}
d.setVerSeq(mpVerSeq)
d.setDeleted() // denParm create at the same version.no need to push to history list
log.LogDebugf("action[deleteTopLayer.delSeq_0] den %v be set deleted at version seq [%v]", d, mpVerSeq)
return d, true, false
}
func (d *Dentry) updateTopLayerSeq(delVerSeq uint64, verlist []*proto.VolVersionInfo) (rd *Dentry, dmore bool, clean bool) {
if !isSeqEqual(delVerSeq, d.getVerSeq()) {
// the top layer is depended on by snapshots and should not be dropped; do nothing
log.LogDebugf("action[updateTopLayerSeq.inSnapList_del_%v] den %v first layer do nothing", delVerSeq, d)
return d, false, false
}
for _, info := range verlist {
if info.Ver > d.getVerSeq() {
d.setVerSeq(info.Ver)
return d, false, false
}
}
return d, true, true
}
func (d *Dentry) cleanDeletedVersion(index int) (bDrop bool) {
if index == 0 {
if len(d.multiSnap.dentryList) == 0 && d.isDeleted() {
bDrop = true
}
return
}
delIdx := index - 1
if !d.multiSnap.dentryList[delIdx].isDeleted() {
return
}
// delete the previous dentry in the history list
log.LogDebugf("action[cleanDeletedVersion] dentry (%v) delete the last seq [%v] which was set deleted before",
d, d.multiSnap.dentryList[delIdx].getVerSeq())
d.multiSnap.dentryList = append(d.multiSnap.dentryList[:delIdx], d.multiSnap.dentryList[delIdx+1:]...)
if len(d.multiSnap.dentryList) == 0 && d.isDeleted() {
log.LogDebugf("ction[cleanDeleteVersion] dentry (%v) require to be deleted", d)
bDrop = true
}
return
}
// The latest dentry may have been deleted earlier and marked DentryDeleted.
// From the version carrying the DentryDeleted flag (inclusive) until a file with the same name is created again, the dentry is invisible.
// If another dentry is created with a larger verSeq, the deleted dentry is pushed onto the history list.
// The returned doMore is true when the caller needs to do the follow-up step, such as unlinking from the parent inode.
func (d *Dentry) deleteVerSnapshot(delVerSeq uint64, mpVerSeq uint64, verlist []*proto.VolVersionInfo) (rd *Dentry, dmore bool, clean bool) { // bool is doMore
log.LogDebugf("action[deleteVerSnapshot] enter.dentry %v delVerseq [%v] mpver [%v] verList %v", d, delVerSeq, mpVerSeq, verlist)
// create denParm version
if !isInitSnapVer(delVerSeq) && delVerSeq > mpVerSeq {
panic(fmt.Sprintf("Dentry version %v large than mp[%v]", delVerSeq, mpVerSeq))
}
if delVerSeq == 0 {
return d.deleteTopLayer(mpVerSeq)
} else {
var (
idx int
den *Dentry
endSeq uint64
)
if den, idx = d.getDentryFromVerList(delVerSeq, true); den == nil {
log.LogDebugf("action[deleteVerSnapshot.inSnapList_del_%v] den %v not found", delVerSeq, d)
return nil, false, false
}
if idx == 0 { // top layer
return d.updateTopLayerSeq(delVerSeq, verlist)
}
// If any live mp-level snapshot exists in the seq scope from den to its next ascending neighbor, keep the dentry snapshot; otherwise drop it.
startSeq := den.getVerSeq()
realIdx := idx - 1 // index in history list layer
if realIdx == 0 {
endSeq = d.getVerSeq()
} else {
endSeq = d.multiSnap.dentryList[realIdx-1].getVerSeq()
if d.multiSnap.dentryList[realIdx-1].isDeleted() {
log.LogInfof("action[deleteVerSnapshot.inSnapList_del_%v] inode[%v] layer %v name %v be deleted already!",
delVerSeq, d.Inode, realIdx, d.multiSnap.dentryList[realIdx-1].Name)
}
}
log.LogDebugf("action[deleteVerSnapshot.inSnapList_del_%v] inode[%v] try drop multiVersion idx %v effective seq scope [%v,%v) ", delVerSeq,
d.Inode, realIdx, den.getVerSeq(), endSeq)
for _, info := range verlist {
if info.Ver >= startSeq && info.Ver < endSeq { // the end version itself is not included
log.LogDebugf("action[deleteVerSnapshotInList.inSnapList_del_%v] inode[%v] dir layer idx %v include snapshot %v.don't drop", delVerSeq, den.Inode, realIdx, info.Ver)
// Some snapshot still depends on the version being deleted, so keep it.
// Every snapshot that depends on this version reaches here when its own deletion runs and finds the effective scope shrinking;
// once nothing depends on this version anymore, the final deletion cleans it up.
den.setVerSeq(info.Ver)
return den, false, false
}
if info.Ver >= endSeq {
break
}
log.LogDebugf("action[deleteVerSnapshotInList.inSnapList_del_%v] inode[%v] try drop scope [%v, %v), mp ver [%v] not suitable",
delVerSeq, den.Inode, den.getVerSeq(), endSeq, info.Ver)
}
log.LogDebugf("action[deleteVerSnapshotInList.inSnapList_del_%v] inode[%v] try drop multiVersion idx %v", delVerSeq, den.Inode, realIdx)
d.multiSnap.dentryList = append(d.multiSnap.dentryList[:realIdx], d.multiSnap.dentryList[realIdx+1:]...)
if d.cleanDeletedVersion(realIdx) {
return den, true, true
}
return den, false, false
}
}
func (d *Dentry) String() string {
str := fmt.Sprintf("dentry(name:[%v],parentId:[%v],inode:[%v],type:[%v],seq:[%v],isDeleted:[%v],dentryList_len[%v])",
d.Name, d.ParentId, d.Inode, d.Type, d.getVerSeq(), d.isDeleted(), d.getSnapListLen())
if d.getSnapListLen() > 0 {
for idx, den := range d.multiSnap.dentryList {
str += fmt.Sprintf("idx:%v,content(%v))", idx, den)
}
}
return str
}
type TxDentry struct {
// ParInode *Inode
Dentry *Dentry
TxInfo *proto.TransactionInfo
}
func NewTxDentry(parentID uint64, name string, ino uint64, mode uint32, parInode *Inode, txInfo *proto.TransactionInfo) *TxDentry {
dentry := &Dentry{
ParentId: parentID,
Name: name,
Inode: ino,
Type: mode,
}
txDentry := &TxDentry{
// ParInode: parInode,
Dentry: dentry,
TxInfo: txInfo,
}
return txDentry
}
func (td *TxDentry) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
//bs, err := td.ParInode.Marshal()
//if err != nil {
// return nil, err
//}
//if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
// return nil, err
//}
//if _, err := buff.Write(bs); err != nil {
// return nil, err
//}
bs, err := td.Dentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
bs, err = td.TxInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
result = buff.Bytes()
return
}
func (td *TxDentry) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
dentry := &Dentry{}
if err = dentry.Unmarshal(data); err != nil {
return
}
td.Dentry = dentry
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
if err = txInfo.Unmarshal(data); err != nil {
return
}
td.TxInfo = txInfo
return
}
type TxUpdateDentry struct {
OldDentry *Dentry
NewDentry *Dentry
TxInfo *proto.TransactionInfo
}
func NewTxUpdateDentry(oldDentry *Dentry, newDentry *Dentry, txInfo *proto.TransactionInfo) *TxUpdateDentry {
txUpdateDentry := &TxUpdateDentry{
OldDentry: oldDentry,
NewDentry: newDentry,
TxInfo: txInfo,
}
return txUpdateDentry
}
func (td *TxUpdateDentry) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
bs, err := td.OldDentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
bs, err = td.NewDentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
bs, err = td.TxInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
result = buff.Bytes()
return
}
func (td *TxUpdateDentry) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
oldDentry := &Dentry{}
if err = oldDentry.Unmarshal(data); err != nil {
return
}
td.OldDentry = oldDentry
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
newDentry := &Dentry{}
if err = newDentry.Unmarshal(data); err != nil {
return
}
td.NewDentry = newDentry
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
if err = txInfo.Unmarshal(data); err != nil {
return
}
td.TxInfo = txInfo
return
}
type DentryBatch []*Dentry
// TODO(leonchang): pre-allocate the buffer and write into it directly, considering space and performance
// Marshal marshals a dentry into a byte array.
func (d *Dentry) Marshal() (result []byte, err error) {
keyBytes := d.MarshalKey()
valBytes := d.MarshalValue()
keyLen := uint32(len(keyBytes))
valLen := uint32(len(valBytes))
buff := bytes.NewBuffer(make([]byte, 0))
buff.Grow(int(keyLen + valLen + 8))
if err = binary.Write(buff, binary.BigEndian, keyLen); err != nil {
return
}
if _, err = buff.Write(keyBytes); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, valLen); err != nil {
return nil, err
}
if _, err = buff.Write(valBytes); err != nil {
return
}
result = buff.Bytes()
return
}
// Unmarshal unmarshals the dentry from a byte array.
func (d *Dentry) Unmarshal(raw []byte) (err error) {
var (
keyLen uint32
valLen uint32
)
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &keyLen); err != nil {
return
}
keyBytes := make([]byte, keyLen)
if _, err = buff.Read(keyBytes); err != nil {
return
}
if err = d.UnmarshalKey(keyBytes); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &valLen); err != nil {
return
}
valBytes := make([]byte, valLen)
if _, err = buff.Read(valBytes); err != nil {
return
}
err = d.UnmarshalValue(valBytes)
return
}
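// exampleDentryRoundTrip is a minimal illustrative sketch (not part of the
// original source) of the entity layout documented above: the dentry is
// encoded as KeyLength|MarshaledKey|ValLength|MarshaledVal, and Unmarshal
// restores it from those bytes. The field values are arbitrary.
func exampleDentryRoundTrip() error {
src := &Dentry{ParentId: 1, Name: "file.txt", Inode: 100, Type: 0o644}
raw, err := src.Marshal()
if err != nil {
return err
}
dst := &Dentry{}
if err = dst.Unmarshal(raw); err != nil {
return err
}
if dst.ParentId != src.ParentId || dst.Name != src.Name || dst.Inode != src.Inode {
return fmt.Errorf("dentry round trip mismatch: %v vs %v", dst, src)
}
return nil
}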
// Marshal marshals the dentryBatch into a byte array.
func (d DentryBatch) Marshal() ([]byte, error) {
buff := bytes.NewBuffer(make([]byte, 0))
if err := binary.Write(buff, binary.BigEndian, uint32(len(d))); err != nil {
return nil, err
}
for _, dentry := range d {
bs, err := dentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
}
return buff.Bytes(), nil
}
// DentryBatchUnmarshal unmarshals a DentryBatch from a byte array.
func DentryBatchUnmarshal(raw []byte) (DentryBatch, error) {
buff := bytes.NewBuffer(raw)
var batchLen uint32
if err := binary.Read(buff, binary.BigEndian, &batchLen); err != nil {
return nil, err
}
result := make(DentryBatch, 0, int(batchLen))
var dataLen uint32
for j := 0; j < int(batchLen); j++ {
if err := binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return nil, err
}
data := make([]byte, int(dataLen))
if _, err := buff.Read(data); err != nil {
return nil, err
}
den := &Dentry{}
if err := den.Unmarshal(data); err != nil {
return nil, err
}
result = append(result, den)
}
return result, nil
}
// Less tests whether the current dentry is less than the given one.
// This method is necessary for B-Tree item implementation.
func (d *Dentry) Less(than BtreeItem) (less bool) {
dentry, ok := than.(*Dentry)
less = ok && ((d.ParentId < dentry.ParentId) || ((d.ParentId == dentry.ParentId) && (d.Name < dentry.Name)))
return
}
func (d *Dentry) CopyDirectly() BtreeItem {
newDentry := *d
newDentry.multiSnap = nil
return &newDentry
}
func (d *Dentry) Copy() BtreeItem {
newDentry := *d
if d.multiSnap != nil {
newDentry.multiSnap = &DentryMultiSnap{
VerSeq: d.multiSnap.VerSeq,
dentryList: d.multiSnap.dentryList,
}
}
return &newDentry
}
// MarshalKey marshals the dentry key (ParentId and Name) into a byte slice.
func (d *Dentry) MarshalKey() (k []byte) {
buff := bytes.NewBuffer(make([]byte, 0))
buff.Grow(32)
if err := binary.Write(buff, binary.BigEndian, &d.ParentId); err != nil {
panic(err)
}
buff.Write([]byte(d.Name))
k = buff.Bytes()
return
}
// UnmarshalKey unmarshals the dentry key (ParentId and Name) from bytes.
func (d *Dentry) UnmarshalKey(k []byte) (err error) {
buff := bytes.NewBuffer(k)
if err = binary.Read(buff, binary.BigEndian, &d.ParentId); err != nil {
return
}
d.Name = string(buff.Bytes())
return
}
func (d *Dentry) MarshalValue() []byte {
buff := bytes.NewBuffer(nil)
buff.Grow(24 + d.getSnapListLen()*20)
writeBinary := func(data interface{}) {
if err := binary.Write(buff, binary.BigEndian, data); err != nil {
panic(err)
}
}
writeBinary(&d.Inode)
writeBinary(&d.Type)
seq := d.getSeqFiled()
if seq == 0 {
return buff.Bytes()
}
writeBinary(&seq)
verCnt := uint32(d.getSnapListLen())
writeBinary(&verCnt)
if d.getSnapListLen() > 0 {
for _, dd := range d.multiSnap.dentryList {
writeBinary(&dd.Inode)
writeBinary(&dd.Type)
seq = dd.getSeqFiled()
writeBinary(&seq)
}
}
return buff.Bytes()
}
func (d *Dentry) UnmarshalValue(val []byte) (err error) {
buff := bytes.NewBuffer(val)
if err = binary.Read(buff, binary.BigEndian, &d.Inode); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &d.Type); err != nil {
return
}
if len(val) >= 24 {
var seq uint64
if err = binary.Read(buff, binary.BigEndian, &seq); err != nil {
return
}
d.multiSnap = NewDentrySnap(seq)
verCnt := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &verCnt); err != nil {
return
}
for i := 0; i < int(verCnt); i++ {
// todo(leonchang) name and parentid should be removed to reduce space
den := &Dentry{
Name: d.Name,
ParentId: d.ParentId,
}
if err = binary.Read(buff, binary.BigEndian, &den.Inode); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &den.Type); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &seq); err != nil {
return
}
if seq > 0 {
den.multiSnap = NewDentrySnap(seq)
}
d.multiSnap.dentryList = append(d.multiSnap.dentryList, den)
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"fmt"
"sync"
"github.com/cubefs/cubefs/util/btree"
)
type ExtentVal struct {
dataMap map[string][]byte
verSeq uint64
}
type Extend struct {
inode uint64
dataMap map[string][]byte
verSeq uint64
multiVers []*Extend
versionMu sync.RWMutex
mu sync.RWMutex
}
func (e *Extend) checkSequence() (err error) {
e.versionMu.RLock()
defer e.versionMu.RUnlock()
lastSeq := e.verSeq
for id, extend := range e.multiVers {
if lastSeq <= extend.verSeq {
return fmt.Errorf("id[%v] seq [%v] not less than last seq [%v]", id, extend.verSeq, lastSeq)
}
}
return
}
func (e *Extend) GetMinVer() uint64 {
if len(e.multiVers) == 0 {
return e.verSeq
}
return e.multiVers[len(e.multiVers)-1].verSeq
}
func (e *Extend) GetExtentByVersion(ver uint64) (extend *Extend) {
if ver == 0 {
return e
}
if isInitSnapVer(ver) {
if e.GetMinVer() != 0 {
return nil
}
return e.multiVers[len(e.multiVers)-1]
}
e.versionMu.RLock()
defer e.versionMu.RUnlock()
for i := 0; i < len(e.multiVers)-1; i++ {
if e.multiVers[i].verSeq <= ver {
return e.multiVers[i]
}
}
return
}
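// NewExtend returns an Extend for the given inode with an empty attribute map.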
func NewExtend(inode uint64) *Extend {
return &Extend{inode: inode, dataMap: make(map[string][]byte)}
}
func NewExtendFromBytes(raw []byte) (*Extend, error) {
var err error
buffer := bytes.NewBuffer(raw)
// decode inode
var inode uint64
if inode, err = binary.ReadUvarint(buffer); err != nil {
return nil, err
}
ext := NewExtend(inode)
// decode number of key-value pairs
var numKV uint64
if numKV, err = binary.ReadUvarint(buffer); err != nil {
return nil, err
}
readBytes := func() ([]byte, error) {
var length uint64
if length, err = binary.ReadUvarint(buffer); err != nil {
return nil, err
}
data := make([]byte, length)
if _, err = buffer.Read(data); err != nil {
return nil, err
}
return data, nil
}
for i := 0; i < int(numKV); i++ {
var k, v []byte
if k, err = readBytes(); err != nil {
return nil, err
}
if v, err = readBytes(); err != nil {
return nil, err
}
ext.Put(k, v, 0)
}
if buffer.Len() > 0 {
// read verSeq
verSeq, err := binary.ReadUvarint(buffer)
if err != nil {
return nil, err
}
ext.verSeq = verSeq
// read number of multiVers
numMultiVers, err := binary.ReadUvarint(buffer)
if err != nil {
return nil, err
}
if numMultiVers > 0 {
// read each multiVers
ext.multiVers = make([]*Extend, numMultiVers)
for i := uint64(0); i < numMultiVers; i++ {
// read multiVers length
mvLen, err := binary.ReadUvarint(buffer)
if err != nil {
return nil, err
}
mvBytes := make([]byte, mvLen)
if _, err = buffer.Read(mvBytes); err != nil {
return nil, err
}
// recursively decode multiVers
mv, err := NewExtendFromBytes(mvBytes)
if err != nil {
return nil, err
}
ext.multiVers[i] = mv
}
}
}
return ext, nil
}
func (e *Extend) Less(than btree.Item) bool {
ext, is := than.(*Extend)
return is && e.inode < ext.inode
}
func (e *Extend) Put(key, value []byte, verSeq uint64) {
e.mu.Lock()
defer e.mu.Unlock()
e.dataMap[string(key)] = value
e.verSeq = verSeq
}
func (e *Extend) Get(key []byte) (value []byte, exist bool) {
e.mu.RLock()
defer e.mu.RUnlock()
value, exist = e.dataMap[string(key)]
return
}
func (e *Extend) Remove(key []byte) {
e.mu.Lock()
defer e.mu.Unlock()
delete(e.dataMap, string(key))
return
}
func (e *Extend) Range(visitor func(key, value []byte) bool) {
e.mu.RLock()
defer e.mu.RUnlock()
for k, v := range e.dataMap {
if !visitor([]byte(k), v) {
return
}
}
}
func (e *Extend) Merge(o *Extend, override bool) {
e.mu.Lock()
defer e.mu.Unlock()
o.Range(func(key, value []byte) bool {
strKey := string(key)
if _, exist := e.dataMap[strKey]; override || !exist {
copied := make([]byte, len(value))
copy(copied, value)
e.dataMap[strKey] = copied
}
return true
})
}
func (e *Extend) Copy() btree.Item {
newExt := NewExtend(e.inode)
e.mu.RLock()
defer e.mu.RUnlock()
for k, v := range e.dataMap {
newExt.dataMap[k] = v
}
newExt.verSeq = e.verSeq
newExt.multiVers = e.multiVers
return newExt
}
func (e *Extend) Bytes() ([]byte, error) {
var err error
e.mu.RLock()
defer e.mu.RUnlock()
var n int
tmp := make([]byte, binary.MaxVarintLen64)
buffer := bytes.NewBuffer(nil)
// write inode with varint codec
n = binary.PutUvarint(tmp, e.inode)
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write number of key-value pairs
n = binary.PutUvarint(tmp, uint64(len(e.dataMap)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write key-value pairs
writeBytes := func(val []byte) error {
n = binary.PutUvarint(tmp, uint64(len(val)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return err
}
if _, err = buffer.Write(val); err != nil {
return err
}
return nil
}
for k, v := range e.dataMap {
// key
if err = writeBytes([]byte(k)); err != nil {
return nil, err
}
// value
if err = writeBytes(v); err != nil {
return nil, err
}
}
if e.verSeq > 0 {
// write verSeq
verSeqBytes := make([]byte, binary.MaxVarintLen64)
verSeqLen := binary.PutUvarint(verSeqBytes, e.verSeq)
if _, err = buffer.Write(verSeqBytes[:verSeqLen]); err != nil {
return nil, err
}
// write number of multiVers
n = binary.PutUvarint(tmp, uint64(len(e.multiVers)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write each multiVers
for _, mv := range e.multiVers {
// write multiVers bytes
mvBytes, err := mv.Bytes()
if err != nil {
return nil, err
}
// write multiVers length
n = binary.PutUvarint(tmp, uint64(len(mvBytes)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write multiVers bytes
if _, err = buffer.Write(mvBytes); err != nil {
return nil, err
}
}
return buffer.Bytes(), nil
}
return buffer.Bytes(), nil
}
func (e *Extend) GetInode() (inode uint64) {
return e.inode
}
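// exampleExtendRoundTrip is a minimal illustrative sketch (not part of the
// original source) of the varint layout produced by Bytes: inode, the number
// of key-value pairs, then length-prefixed keys and values, which
// NewExtendFromBytes decodes again. The attribute key and value are arbitrary.
func exampleExtendRoundTrip() error {
src := NewExtend(1)
src.Put([]byte("user.owner"), []byte("cubefs"), 0)
raw, err := src.Bytes()
if err != nil {
return err
}
dst, err := NewExtendFromBytes(raw)
if err != nil {
return err
}
if v, ok := dst.Get([]byte("user.owner")); !ok || string(v) != "cubefs" {
return fmt.Errorf("extend round trip mismatch: %q", v)
}
return nil
}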
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"container/list"
"sync"
)
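// freeList is a FIFO list of inode IDs; the index map deduplicates entries and allows O(1) removal by inode ID.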
type freeList struct {
sync.Mutex
list *list.List
index map[uint64]*list.Element
}
func newFreeList() *freeList {
return &freeList{
list: list.New(),
index: make(map[uint64]*list.Element),
}
}
// Pop removes the first item on the list and returns it.
func (fl *freeList) Pop() (ino uint64) {
fl.Lock()
defer fl.Unlock()
item := fl.list.Front()
if item == nil {
return
}
val := fl.list.Remove(item)
ino = val.(uint64)
delete(fl.index, ino)
return
}
// Push inserts a new item at the back of the list.
func (fl *freeList) Push(ino uint64) {
fl.Lock()
defer fl.Unlock()
if _, ok := fl.index[ino]; !ok {
item := fl.list.PushBack(ino)
fl.index[ino] = item
}
}
func (fl *freeList) Remove(ino uint64) {
fl.Lock()
defer fl.Unlock()
if item, ok := fl.index[ino]; ok {
fl.list.Remove(item)
delete(fl.index, ino)
}
}
func (fl *freeList) Len() int {
fl.Lock()
defer fl.Unlock()
return len(fl.index)
}
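// exampleFreeListFIFO is a minimal illustrative sketch (not part of the
// original source): Push keeps FIFO order and deduplicates by inode ID via
// the index map, so Pop returns each queued inode at most once.
func exampleFreeListFIFO() []uint64 {
fl := newFreeList()
fl.Push(10)
fl.Push(20)
fl.Push(10) // duplicate: already indexed, so it is ignored
out := make([]uint64, 0, fl.Len())
for fl.Len() > 0 {
out = append(out, fl.Pop())
}
return out // [10 20]
}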
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package metanode
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
type InodeParam struct {
Ino uint64
Type uint32
}
func FuzzNewInode(data []byte) int {
f := fuzz.NewConsumer(data)
param := InodeParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
ino := NewInode(param.Ino, param.Type)
if ino == nil {
return 0
}
return 1
}
func FuzzNewExtend(data []byte) int {
f := fuzz.NewConsumer(data)
var ino uint64
err := f.GenerateStruct(&ino)
if err != nil {
return 0
}
extend := NewExtend(ino)
if extend == nil {
return 0
}
return 1
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
syslog "log"
"math"
"runtime/debug"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
const (
DeleteMarkFlag = 1 << 0
InodeDelTop = 1 << 1
)
var (
// InodeV1Flag uint64 = 0x01
V2EnableColdInodeFlag uint64 = 0x02
V3EnableSnapInodeFlag uint64 = 0x04
)
// Inode wraps necessary properties of `Inode` information in the file system.
// Marshal key:
// +-------+-------+
// | item | Inode |
// +-------+-------+
// | bytes | 8 |
// +-------+-------+
// Marshal value:
// +-------+------+------+-----+----+----+----+--------+------------------+
// | item | Type | Size | Gen | CT | AT | MT | ExtLen | MarshaledExtents |
// +-------+------+------+-----+----+----+----+--------+------------------+
// | bytes | 4 | 8 | 8 | 8 | 8 | 8 | 4 | ExtLen |
// +-------+------+------+-----+----+----+----+--------+------------------+
// Marshal entity:
// +-------+-----------+--------------+-----------+--------------+
// | item | KeyLength | MarshaledKey | ValLength | MarshaledVal |
// +-------+-----------+--------------+-----------+--------------+
// | bytes | 4 | KeyLength | 4 | ValLength |
// +-------+-----------+--------------+-----------+--------------+
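// exampleInodeRoundTrip is a minimal illustrative sketch (not part of the
// original source) of the entity layout documented above: the inode is
// encoded as KeyLength|MarshaledKey|ValLength|MarshaledVal, and Unmarshal
// restores it. The inode ID, mode, and size used here are arbitrary.
func exampleInodeRoundTrip() error {
src := NewInode(100, 0o644)
src.Size = 4096
raw, err := src.Marshal()
if err != nil {
return err
}
dst := NewInode(0, 0)
if err = dst.Unmarshal(raw); err != nil {
return err
}
if dst.Inode != src.Inode || dst.Size != src.Size {
return fmt.Errorf("inode round trip mismatch: %v vs %v", dst, src)
}
return nil
}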
type InodeMultiSnap struct {
verSeq uint64 // latest version at which the inode was created or modified
multiVersions InodeBatch
ekRefMap *sync.Map
}
type Inode struct {
sync.RWMutex
Inode uint64 // Inode ID
Type uint32
Uid uint32
Gid uint32
Size uint64
Generation uint64
CreateTime int64
AccessTime int64
ModifyTime int64
LinkTarget []byte // SymLink target name
NLink uint32 // NodeLink counts
Flag int32
Reserved uint64 // reserved space
// Extents *ExtentsTree
Extents *SortedExtents
ObjExtents *SortedObjExtents
// Snapshot
multiSnap *InodeMultiSnap
}
func (i *Inode) GetMultiVerString() string {
if i.multiSnap == nil {
return "nil"
}
return fmt.Sprintf("%v", i.multiSnap.multiVersions)
}
func (i *Inode) RangeMultiVer(visitor func(idx int, info *Inode) bool) {
if i.multiSnap == nil {
return
}
for k, v := range i.multiSnap.multiVersions {
if !visitor(k, v) {
break
}
}
}
func isInitSnapVer(seq uint64) bool {
return seq == math.MaxUint64
}
func NewMultiSnap(seq uint64) *InodeMultiSnap {
return &InodeMultiSnap{
verSeq: seq,
}
}
func (i *Inode) verUpdate(seq uint64) {
if seq == 0 && i.multiSnap == nil {
return
}
if i.multiSnap == nil {
i.multiSnap = NewMultiSnap(seq)
} else {
i.multiSnap.verSeq = seq
}
}
func (i *Inode) setVerNoCheck(seq uint64) {
i.verUpdate(seq)
}
func (i *Inode) setVer(seq uint64) {
if i.getVer() > seq {
syslog.Println(fmt.Sprintf("inode[%v] old seq [%v] cann't use seq [%v]", i.getVer(), seq, string(debug.Stack())))
log.LogFatalf("inode[%v] old seq [%v] cann't use seq [%v] stack %v", i.Inode, i.getVer(), seq, string(debug.Stack()))
}
i.verUpdate(seq)
}
func (i *Inode) insertEkRefMap(mpId uint64, ek *proto.ExtentKey) {
if i.multiSnap == nil {
i.multiSnap = NewMultiSnap(i.getVer())
}
if i.multiSnap.ekRefMap == nil {
i.multiSnap.ekRefMap = new(sync.Map)
}
storeEkSplit(mpId, i.Inode, i.multiSnap.ekRefMap, ek)
}
func (i *Inode) isEkInRefMap(mpId uint64, ek *proto.ExtentKey) (ok bool) {
if i.multiSnap == nil {
return
}
if i.multiSnap.ekRefMap == nil {
log.LogErrorf("[storeEkSplit] mpId [%v] inodeID %v ekRef nil", mpId, i.Inode)
return
}
log.LogDebugf("[storeEkSplit] mpId [%v] inode[%v] mp[%v] extent id[%v] ek [%v]", mpId, i.Inode, ek.PartitionId, ek.ExtentId, ek)
id := ek.PartitionId<<32 | ek.ExtentId
_, ok = i.multiSnap.ekRefMap.Load(id)
return
}
func (i *Inode) getVer() uint64 {
if i.multiSnap == nil {
return 0
}
return i.multiSnap.verSeq
}
func (i *Inode) getLayerLen() int {
if i.multiSnap == nil {
return 0
}
return len(i.multiSnap.multiVersions)
}
func (i *Inode) getLayerVer(layer int) uint64 {
if i.multiSnap == nil {
log.LogErrorf("getLayerVer. inode[%v] multi snap nil", i.Inode)
return 0
}
if layer > i.getLayerLen()-1 {
log.LogErrorf("getLayerVer. inode[%v] layer %v not exist, len %v", i.Inode, layer, i.getLayerLen())
return 0
}
if i.multiSnap.multiVersions[layer] == nil {
log.LogErrorf("getLayerVer. inode[%v] layer %v nil", i.Inode, layer)
return 0
}
return i.multiSnap.multiVersions[layer].getVer()
}
func (i *Inode) isEmptyVerList() bool {
return i.getLayerLen() == 0
}
func (i *Inode) isTailIndexInList(id int) bool {
return id == i.getLayerLen()-1
}
func (i *Inode) getTailVerInList() (verSeq uint64, found bool) {
mLen := i.getLayerLen()
if mLen > 0 {
return i.getLayerVer(mLen - 1), true
}
return 0, false
}
// GetAllExtsOfflineInode collects all existing extent info of an inode being cleaned via the freelist, handling the special case of split extent keys.
func (inode *Inode) GetAllExtsOfflineInode(mpID uint64) (extInfo map[uint64][]*proto.ExtentKey) {
log.LogDebugf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] inode.Extents: %v, ino verList: %v",
mpID, inode.Inode, inode.Extents, inode.GetMultiVerString())
extInfo = make(map[uint64][]*proto.ExtentKey)
if inode.getLayerLen() > 0 {
log.LogWarnf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] verlist len %v should not drop",
mpID, inode.Inode, inode.getLayerLen())
}
for i := 0; i < inode.getLayerLen()+1; i++ {
dIno := inode
if i > 0 {
dIno = inode.multiSnap.multiVersions[i-1]
}
log.LogDebugf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] dino[%v]", mpID, inode.Inode, dIno)
dIno.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
if ek.IsSplit() {
var (
dOK bool
last bool
)
log.LogDebugf("deleteMarkedInodes DecSplitEk mpID %v inode[%v]", mpID, inode.Inode)
if dOK, last = dIno.DecSplitEk(mpID, &ek); !dOK {
return false
}
if !last {
log.LogDebugf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] ek [%v] be removed", mpID, inode.Inode, ek)
return true
}
log.LogDebugf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] ek [%v] be removed", mpID, inode.Inode, ek)
}
extInfo[ek.PartitionId] = append(extInfo[ek.PartitionId], &ek)
// NOTE: unnecessary to set ext
log.LogWritef("GetAllExtsOfflineInode. mp[%v] ino(%v) deleteExtent(%v)", mpID, inode.Inode, ek.String())
return true
})
// NOTE: clear all extents in this layer
dIno.Extents = NewSortedExtents()
}
return
}
type InodeBatch []*Inode
type TxInode struct {
Inode *Inode
TxInfo *proto.TransactionInfo
}
func NewTxInode(ino uint64, t uint32, txInfo *proto.TransactionInfo) *TxInode {
ti := &TxInode{
Inode: NewInode(ino, t),
TxInfo: txInfo,
}
return ti
}
func (ti *TxInode) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
bs, err := ti.Inode.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
bs, err = ti.TxInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
result = buff.Bytes()
return
}
func (ti *TxInode) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
ino := NewInode(0, 0)
if err = ino.Unmarshal(data); err != nil {
return
}
ti.Inode = ino
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
if err = txInfo.Unmarshal(data); err != nil {
return
}
ti.TxInfo = txInfo
return
}
func (i *InodeBatch) Clone() InodeBatch {
var rB []*Inode
for _, inode := range []*Inode(*i) {
rB = append(rB, inode.Copy().(*Inode))
}
return rB
}
func (ino *Inode) getAllInodesInfo() (rsp []proto.InodeInfo) {
ino.RLock()
defer ino.RUnlock()
ino.RangeMultiVer(func(idx int, info *Inode) bool {
rspInodeInfo := &proto.InodeInfo{}
replyInfoNoCheck(rspInodeInfo, info)
rsp = append(rsp, *rspInodeInfo)
return true
})
return
}
func (ino *Inode) getAllLayerEks() (rsp []proto.LayerInfo) {
ino.RLock()
defer ino.RUnlock()
rspInodeInfo := &proto.InodeInfo{}
replyInfoNoCheck(rspInodeInfo, ino)
layerInfo := proto.LayerInfo{
LayerIdx: 0,
Info: rspInodeInfo,
Eks: ino.Extents.eks,
}
rsp = append(rsp, layerInfo)
ino.RangeMultiVer(func(idx int, info *Inode) bool {
rspInodeInfo := &proto.InodeInfo{}
replyInfo(rspInodeInfo, info, nil)
layerInfo := proto.LayerInfo{
LayerIdx: uint32(idx + 1),
Info: rspInodeInfo,
Eks: info.Extents.eks,
}
rsp = append(rsp, layerInfo)
return true
})
return
}
// String returns the string format of the inode.
func (i *Inode) String() string {
i.RLock()
defer i.RUnlock()
buff := bytes.NewBuffer(nil)
buff.Grow(128)
buff.WriteString("Inode{")
buff.WriteString(fmt.Sprintf("Inode[%d]", i.Inode))
buff.WriteString(fmt.Sprintf("Type[%d]", i.Type))
buff.WriteString(fmt.Sprintf("Uid[%d]", i.Uid))
buff.WriteString(fmt.Sprintf("Gid[%d]", i.Gid))
buff.WriteString(fmt.Sprintf("Size[%d]", i.Size))
buff.WriteString(fmt.Sprintf("Gen[%d]", i.Generation))
buff.WriteString(fmt.Sprintf("CT[%d]", i.CreateTime))
buff.WriteString(fmt.Sprintf("AT[%d]", i.AccessTime))
buff.WriteString(fmt.Sprintf("MT[%d]", i.ModifyTime))
buff.WriteString(fmt.Sprintf("LinkT[%s]", i.LinkTarget))
buff.WriteString(fmt.Sprintf("NLink[%d]", i.NLink))
buff.WriteString(fmt.Sprintf("Flag[%d]", i.Flag))
buff.WriteString(fmt.Sprintf("Reserved[%d]", i.Reserved))
buff.WriteString(fmt.Sprintf("Extents[%s]", i.Extents))
buff.WriteString(fmt.Sprintf("ObjExtents[%s]", i.ObjExtents))
buff.WriteString(fmt.Sprintf("verSeq[%v]", i.getVer()))
buff.WriteString(fmt.Sprintf("multiSnap.multiVersions.len[%v]", i.getLayerLen()))
buff.WriteString("}")
return buff.String()
}
// NewInode returns a new Inode instance with the specified inode ID and type.
// The CreateTime, AccessTime and ModifyTime will be set to the current time.
func NewInode(ino uint64, t uint32) *Inode {
ts := timeutil.GetCurrentTimeUnix()
i := &Inode{
Inode: ino,
Type: t,
Generation: 1,
CreateTime: ts,
AccessTime: ts,
ModifyTime: ts,
NLink: 1,
Extents: NewSortedExtents(),
ObjExtents: NewSortedObjExtents(),
multiSnap: nil,
}
if proto.IsDir(t) {
i.NLink = 2
}
return i
}
// Less tests whether the current Inode item is less than the given one.
// This method is necessary for the B-Tree item implementation.
func (i *Inode) Less(than BtreeItem) bool {
ino, ok := than.(*Inode)
return ok && i.Inode < ino.Inode
}
// Copy returns a copy of the inode.
func (i *Inode) Copy() BtreeItem {
newIno := NewInode(i.Inode, i.Type)
i.RLock()
newIno.Uid = i.Uid
newIno.Gid = i.Gid
newIno.Size = i.Size
newIno.Generation = i.Generation
newIno.CreateTime = i.CreateTime
newIno.ModifyTime = i.ModifyTime
newIno.AccessTime = i.AccessTime
if size := len(i.LinkTarget); size > 0 {
newIno.LinkTarget = make([]byte, size)
copy(newIno.LinkTarget, i.LinkTarget)
}
newIno.NLink = i.NLink
newIno.Flag = i.Flag
newIno.Reserved = i.Reserved
newIno.Extents = i.Extents.Clone()
newIno.ObjExtents = i.ObjExtents.Clone()
if i.multiSnap != nil {
newIno.multiSnap = &InodeMultiSnap{
verSeq: i.getVer(),
multiVersions: i.multiSnap.multiVersions.Clone(),
ekRefMap: i.multiSnap.ekRefMap,
}
}
i.RUnlock()
return newIno
}
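// CopyInodeOnly copies cInode's metadata fields but reuses the receiver's Extents,
// ObjExtents and multiSnap references.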
func (i *Inode) CopyInodeOnly(cInode *Inode) *Inode {
tmpInode := cInode.CopyDirectly().(*Inode)
tmpInode.Extents = i.Extents
tmpInode.ObjExtents = i.ObjExtents
tmpInode.multiSnap = i.multiSnap
return tmpInode
}
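// CopyDirectly copies the inode without taking the lock and without the multi-version (multiSnap) data.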
func (i *Inode) CopyDirectly() BtreeItem {
newIno := NewInode(i.Inode, i.Type)
newIno.Uid = i.Uid
newIno.Gid = i.Gid
newIno.Size = i.Size
newIno.Generation = i.Generation
newIno.CreateTime = i.CreateTime
newIno.ModifyTime = i.ModifyTime
newIno.AccessTime = i.AccessTime
if size := len(i.LinkTarget); size > 0 {
newIno.LinkTarget = make([]byte, size)
copy(newIno.LinkTarget, i.LinkTarget)
}
newIno.NLink = i.NLink
newIno.Flag = i.Flag
newIno.Reserved = i.Reserved
newIno.Extents = i.Extents.Clone()
newIno.ObjExtents = i.ObjExtents.Clone()
return newIno
}
// MarshalToJSON is the wrapper of json.Marshal.
func (i *Inode) MarshalToJSON() ([]byte, error) {
i.RLock()
defer i.RUnlock()
return json.Marshal(i)
}
// Marshal marshals the inode into a byte array.
func (i *Inode) Marshal() (result []byte, err error) {
keyBytes := i.MarshalKey()
valBytes := i.MarshalValue()
keyLen := uint32(len(keyBytes))
valLen := uint32(len(valBytes))
buff := bytes.NewBuffer(make([]byte, 0, 128))
if err = binary.Write(buff, binary.BigEndian, keyLen); err != nil {
return
}
if _, err = buff.Write(keyBytes); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, valLen); err != nil {
return
}
if _, err = buff.Write(valBytes); err != nil {
return
}
result = buff.Bytes()
return
}
// Unmarshal unmarshals the inode.
func (i *Inode) Unmarshal(raw []byte) (err error) {
var (
keyLen uint32
valLen uint32
)
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &keyLen); err != nil {
return
}
keyBytes := make([]byte, keyLen)
if _, err = buff.Read(keyBytes); err != nil {
return
}
if err = i.UnmarshalKey(keyBytes); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &valLen); err != nil {
return
}
valBytes := make([]byte, valLen)
if _, err = buff.Read(valBytes); err != nil {
return
}
err = i.UnmarshalValue(valBytes)
return
}
// Marshal marshals the inodeBatch into a byte array.
func (i InodeBatch) Marshal() ([]byte, error) {
buff := bytes.NewBuffer(make([]byte, 0))
if err := binary.Write(buff, binary.BigEndian, uint32(len(i))); err != nil {
return nil, err
}
for _, inode := range i {
bs, err := inode.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
}
return buff.Bytes(), nil
}
// InodeBatchUnmarshal unmarshals an InodeBatch from bytes.
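// A minimal round-trip sketch (inode IDs and mode are arbitrary):
//
//	batch := InodeBatch{NewInode(10, 0o644), NewInode(11, 0o644)}
//	raw, err := batch.Marshal()
//	if err == nil {
//		decoded, _ := InodeBatchUnmarshal(raw)
//		_ = decoded
//	}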
func InodeBatchUnmarshal(raw []byte) (InodeBatch, error) {
buff := bytes.NewBuffer(raw)
var batchLen uint32
if err := binary.Read(buff, binary.BigEndian, &batchLen); err != nil {
return nil, err
}
result := make(InodeBatch, 0, int(batchLen))
var dataLen uint32
for j := 0; j < int(batchLen); j++ {
if err := binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return nil, err
}
data := make([]byte, int(dataLen))
if _, err := buff.Read(data); err != nil {
return nil, err
}
ino := NewInode(0, 0)
if err := ino.Unmarshal(data); err != nil {
return nil, err
}
result = append(result, ino)
}
return result, nil
}
// MarshalKey marshals the inode key (the inode ID) to bytes.
func (i *Inode) MarshalKey() (k []byte) {
k = make([]byte, 8)
binary.BigEndian.PutUint64(k, i.Inode)
return
}
// UnmarshalKey unmarshals the inode key (the inode ID) from bytes.
func (i *Inode) UnmarshalKey(k []byte) (err error) {
i.Inode = binary.BigEndian.Uint64(k)
return
}
// MarshalInodeValue marshals the fields of a single inode layer (excluding the multi-version list) into buff.
func (i *Inode) MarshalInodeValue(buff *bytes.Buffer) {
var err error
if err = binary.Write(buff, binary.BigEndian, &i.Type); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Uid); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Gid); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Size); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Generation); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.CreateTime); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.AccessTime); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.ModifyTime); err != nil {
panic(err)
}
// write SymLink
symSize := uint32(len(i.LinkTarget))
if err = binary.Write(buff, binary.BigEndian, &symSize); err != nil {
panic(err)
}
if _, err = buff.Write(i.LinkTarget); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.NLink); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Flag); err != nil {
panic(err)
}
if i.ObjExtents != nil && len(i.ObjExtents.eks) > 0 {
i.Reserved |= V2EnableColdInodeFlag
}
i.Reserved |= V3EnableSnapInodeFlag
// log.LogInfof("action[MarshalInodeValue] inode[%v] Reserved %v", i.Inode, i.Reserved)
if err = binary.Write(buff, binary.BigEndian, &i.Reserved); err != nil {
panic(err)
}
// marshal ExtentsKey
extData, err := i.Extents.MarshalBinary(true)
if err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(extData))); err != nil {
panic(err)
}
if _, err = buff.Write(extData); err != nil {
panic(err)
}
if i.Reserved&V2EnableColdInodeFlag > 0 {
// marshal ObjExtentsKey
objExtData, err := i.ObjExtents.MarshalBinary()
if err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(objExtData))); err != nil {
panic(err)
}
if _, err = buff.Write(objExtData); err != nil {
panic(err)
}
}
if err = binary.Write(buff, binary.BigEndian, i.getVer()); err != nil {
panic(err)
}
return
}
// MarshalValue marshals the value to bytes.
func (i *Inode) MarshalValue() (val []byte) {
var err error
buff := bytes.NewBuffer(make([]byte, 0, 128))
buff.Grow(64)
i.RLock()
i.MarshalInodeValue(buff)
if i.getLayerLen() > 0 && i.getVer() == 0 {
log.LogFatalf("action[MarshalValue] inode[%v] current verseq [%v], hist len (%v) stack(%v)", i.Inode, i.getVer(), i.getLayerLen(), string(debug.Stack()))
}
if err = binary.Write(buff, binary.BigEndian, int32(i.getLayerLen())); err != nil {
i.RUnlock()
panic(err)
}
if i.multiSnap != nil {
for _, ino := range i.multiSnap.multiVersions {
ino.MarshalInodeValue(buff)
}
}
val = buff.Bytes()
i.RUnlock()
return
}
// UnmarshalInodeValue unmarshals the fields of a single inode layer from buff.
func (i *Inode) UnmarshalInodeValue(buff *bytes.Buffer) (err error) {
if err = binary.Read(buff, binary.BigEndian, &i.Type); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Uid); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Gid); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Size); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Generation); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.CreateTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.AccessTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.ModifyTime); err != nil {
return
}
// read symLink
symSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &symSize); err != nil {
return
}
if symSize > 0 {
i.LinkTarget = make([]byte, symSize)
if _, err = io.ReadFull(buff, i.LinkTarget); err != nil {
return
}
}
if err = binary.Read(buff, binary.BigEndian, &i.NLink); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Flag); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Reserved); err != nil {
return
}
// unmarshal ExtentsKey
if i.Extents == nil {
i.Extents = NewSortedExtents()
}
if i.ObjExtents == nil {
i.ObjExtents = NewSortedObjExtents()
}
v3 := i.Reserved&V3EnableSnapInodeFlag > 0
v2 := i.Reserved&V2EnableColdInodeFlag > 0
if v2 || v3 {
extSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &extSize); err != nil {
return
}
if extSize > 0 {
extBytes := make([]byte, extSize)
if _, err = io.ReadFull(buff, extBytes); err != nil {
return
}
var ekRef *sync.Map
if err, ekRef = i.Extents.UnmarshalBinary(extBytes, v3); err != nil {
return
}
// log.LogDebugf("inode[%v] ekRef %v", i.Inode, ekRef)
if ekRef != nil {
if i.multiSnap == nil {
i.multiSnap = NewMultiSnap(0)
}
// log.LogDebugf("inode[%v] ekRef %v", i.Inode, ekRef)
i.multiSnap.ekRefMap = ekRef
}
}
} else {
if err, _ = i.Extents.UnmarshalBinary(buff.Bytes(), false); err != nil {
return
}
return
}
if v2 {
// unmarshal ObjExtentsKey
ObjExtSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &ObjExtSize); err != nil {
return
}
if ObjExtSize > 0 {
objExtBytes := make([]byte, ObjExtSize)
if _, err = io.ReadFull(buff, objExtBytes); err != nil {
return
}
if err = i.ObjExtents.UnmarshalBinary(objExtBytes); err != nil {
return
}
}
}
if v3 {
var seq uint64
if err = binary.Read(buff, binary.BigEndian, &seq); err != nil {
return
}
if seq != 0 {
i.setVer(seq)
}
}
return
}
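// GetSpaceSize returns the space occupied by the inode's extents; temp files (unlinked regular files) report zero.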
func (i *Inode) GetSpaceSize() (extSize uint64) {
if i.IsTempFile() {
return
}
extSize += i.Extents.LayerSize()
return
}
// UnmarshalValue unmarshals the value from bytes.
func (i *Inode) UnmarshalValue(val []byte) (err error) {
buff := bytes.NewBuffer(val)
if err = i.UnmarshalInodeValue(buff); err != nil {
return
}
if i.Reserved&V3EnableSnapInodeFlag > 0 {
var verCnt int32
if err = binary.Read(buff, binary.BigEndian, &verCnt); err != nil {
log.LogInfof("action[UnmarshalValue] err get ver cnt inode[%v] new seq [%v]", i.Inode, i.getVer())
return
}
if verCnt > 0 && i.getVer() == 0 {
err = fmt.Errorf("inode[%v] verCnt %v root ver [%v]", i.Inode, verCnt, i.getVer())
log.LogFatalf("UnmarshalValue. %v", err)
return
}
for idx := int32(0); idx < verCnt; idx++ {
ino := &Inode{Inode: i.Inode}
if err = ino.UnmarshalInodeValue(buff); err != nil {
return
}
// make sure the top-layer multiSnap exists before its ekRefMap is touched below
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
if ino.multiSnap != nil && ino.multiSnap.ekRefMap != nil {
if i.multiSnap.ekRefMap == nil {
i.multiSnap.ekRefMap = new(sync.Map)
}
// log.LogDebugf("UnmarshalValue. inode[%v] merge top layer multiSnap.ekRefMap with layer %v", i.Inode, idx)
proto.MergeSplitKey(i.Inode, i.multiSnap.ekRefMap, ino.multiSnap.ekRefMap)
}
// log.LogDebugf("action[UnmarshalValue] inode[%v] old seq [%v] hist len %v", ino.Inode, ino.getVer(), i.getLayerLen())
i.multiSnap.multiVersions = append(i.multiSnap.multiVersions, ino)
}
}
return
}
// AppendExtents appends the given extent keys to the inode's extent list.
func (i *Inode) AppendExtents(eks []proto.ExtentKey, ct int64, volType int) (delExtents []proto.ExtentKey) {
if proto.IsCold(volType) {
return
}
i.Lock()
defer i.Unlock()
for _, ek := range eks {
delItems := i.Extents.Append(ek)
size := i.Extents.Size()
if i.Size < size {
i.Size = size
}
delExtents = append(delExtents, delItems...)
}
i.Generation++
i.ModifyTime = ct
return
}
// AppendObjExtents appends the given object extent keys to the inode's object extent list.
func (i *Inode) AppendObjExtents(eks []proto.ObjExtentKey, ct int64) (err error) {
i.Lock()
defer i.Unlock()
for _, ek := range eks {
err = i.ObjExtents.Append(ek)
if err != nil {
return
}
size := i.ObjExtents.Size()
if i.Size < size {
i.Size = size
}
}
i.Generation++
i.ModifyTime = ct
return
}
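// PrintAllVersionInfo logs the version sequence of every snapshot layer of the inode.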
func (i *Inode) PrintAllVersionInfo() {
if i.multiSnap == nil {
return
}
log.LogInfof("action[PrintAllVersionInfo] inode[%v] verSeq [%v] hist len [%v]", i.Inode, i.getVer(), i.getLayerLen())
for id, info := range i.multiSnap.multiVersions {
log.LogInfof("action[PrintAllVersionInfo] layer [%v] verSeq [%v] inode[%v]", id, info.getVer(), info)
}
}
// MultiLayerClearExtByVer removes from the given layer the extent keys whose seq is newer than dVerSeq and returns them.
func (i *Inode) MultiLayerClearExtByVer(layer int, dVerSeq uint64) (delExtents []proto.ExtentKey) {
var ino *Inode
if layer == 0 {
ino = i
} else {
ino = i.multiSnap.multiVersions[layer-1]
}
ino.Extents.Lock()
defer ino.Extents.Unlock()
// filter in place: keep extents whose seq is not newer than dVerSeq and
// collect the newer ones so the caller can delete them
eks := ino.Extents.eks[:0]
for _, ek := range ino.Extents.eks {
if ek.GetSeq() > dVerSeq {
delExtents = append(delExtents, ek)
continue
}
eks = append(eks, ek)
}
ino.Extents.eks = eks
return
}
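// mergeExtentArr merges two extent-key slices that are sorted by FileOffset into one
// sorted slice, coalescing adjacent split extents with the same seq and releasing
// their split references via DecSplitEk.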
func (i *Inode) mergeExtentArr(mpId uint64, extentKeysLeft []proto.ExtentKey, extentKeysRight []proto.ExtentKey) []proto.ExtentKey {
lCnt := len(extentKeysLeft)
rCnt := len(extentKeysRight)
sortMergedExts := make([]proto.ExtentKey, 0, lCnt+rCnt)
lPos, rPos := 0, 0
doWork := func(keyArr *[]proto.ExtentKey, pos int) {
mLen := len(sortMergedExts)
if mLen > 0 && sortMergedExts[mLen-1].IsSequenceWithSameSeq(&(*keyArr)[pos]) {
sortMergedExts[mLen-1].Size += (*keyArr)[pos].Size
log.LogDebugf("[mergeExtentArr] mpId[%v]. ek left %v right %v", mpId, sortMergedExts[mLen-1], (*keyArr)[pos])
if !sortMergedExts[mLen-1].IsSplit() || !(*keyArr)[pos].IsSplit() {
log.LogErrorf("[mergeExtentArr] mpId[%v] ino[%v] ek merge left %v right %v not all split", mpId, i.Inode, sortMergedExts[mLen-1], (*keyArr)[pos])
}
i.DecSplitEk(mpId, &(*keyArr)[pos])
} else {
sortMergedExts = append(sortMergedExts, (*keyArr)[pos])
}
}
for {
if lPos == lCnt {
sortMergedExts = append(sortMergedExts, extentKeysRight[rPos:]...)
break
}
if rPos == rCnt {
sortMergedExts = append(sortMergedExts, extentKeysLeft[lPos:]...)
break
}
if extentKeysLeft[lPos].FileOffset < extentKeysRight[rPos].FileOffset {
doWork(&extentKeysLeft, lPos)
lPos++
} else {
doWork(&extentKeysRight, rPos)
rPos++
}
}
return sortMergedExts
}
// RestoreExts2NextLayer restores extent info to the next older version, or marks it for deletion if no suitable version exists.
// The list (multiSnap.multiVersions) records every point of modification on the inode; each extent must belong to exactly one layer.
// If the layer being deleted is the top layer, its version is moved down to the next layer; otherwise the extents it owns are exclusive and can be dropped.
func (i *Inode) RestoreExts2NextLayer(mpId uint64, delExtentsOrigin []proto.ExtentKey, curVer uint64, idx int) (delExtents []proto.ExtentKey, err error) {
log.LogInfof("action[RestoreMultiSnapExts] mpId [%v] curVer [%v] delExtents size [%v] hist len [%v]", mpId, curVer, len(delExtentsOrigin), i.getLayerLen())
// no versions left: all old versions have been deleted
if i.isEmptyVerList() {
log.LogWarnf("action[RestoreMultiSnapExts] mpId [%v] inode[%v] restore have no old version left", mpId, i.Inode)
return delExtentsOrigin, nil
}
lastSeq := i.multiSnap.multiVersions[idx].getVer()
specSnapExtent := make([]proto.ExtentKey, 0)
for _, delExt := range delExtentsOrigin {
// a delExt whose seq is larger than the next version's seq does not belong to any
// remaining version, so it can be deleted directly
log.LogDebugf("action[RestoreMultiSnapExts] mpId [%v] inode[%v] ext split [%v] with seq[%v] gSeq[%v] try to del.the last seq [%v], ek details[%v]",
mpId, i.Inode, delExt.IsSplit(), delExt.GetSeq(), curVer, lastSeq, delExt)
if delExt.GetSeq() > lastSeq {
delExtents = append(delExtents, delExt)
} else {
log.LogInfof("action[RestoreMultiSnapExts] mpId [%v] inode[%v] move to level 1 delExt [%v] specSnapExtent size [%v]", mpId, i.Inode, delExt, len(specSnapExtent))
specSnapExtent = append(specSnapExtent, delExt)
}
}
if len(specSnapExtent) == 0 {
log.LogInfof("action[RestoreMultiSnapExts] mpId [%v] inode[%v] no need to move to level 1", mpId, i.Inode)
return
}
if len(specSnapExtent) > 0 && i.isEmptyVerList() {
err = fmt.Errorf("mpId [%v] inode[%v] error not found prev snapshot index", mpId, i.Inode)
log.LogErrorf("action[RestoreMultiSnapExts] mpId [%v] inode[%v] %v", mpId, i.Inode, err)
return
}
i.multiSnap.multiVersions[idx].Extents.Lock()
i.multiSnap.multiVersions[idx].Extents.eks = i.mergeExtentArr(mpId, i.multiSnap.multiVersions[idx].Extents.eks, specSnapExtent)
i.multiSnap.multiVersions[idx].Extents.Unlock()
return
}
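// unlinkTopLayer unlinks the inode at the top (current) layer. Depending on the mp version
// and the volume version list it may restore extents to a lower layer or create a new version
// first. It returns the extents that can be deleted, whether further processing is needed,
// and the op status.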
func (inode *Inode) unlinkTopLayer(mpId uint64, ino *Inode, mpVer uint64, verlist *proto.VolVersionInfoList) (ext2Del []proto.ExtentKey, doMore bool, status uint8) {
// If the inode has no snapshot of its own and no snapshot was taken after its version, unlink it directly without creating a snapshot.
// Otherwise the inode just moves to the upper layer, which behaves as if the snapshot were dropped.
log.LogDebugf("action[unlinkTopLayer] mpid [%v] mpver [%v] check if have snapshot depends on the deleitng ino[%v] (with no snapshot itself) found seq [%v], verlist %v",
mpId, mpVer, ino, inode.getVer(), verlist)
status = proto.OpOk
delFunc := func() (done bool) {
if inode.NLink > 1 {
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked, file link is %v", ino.Inode, inode.NLink)
inode.DecNLink()
doMore = false
return true
}
// first layer need delete
var err error
if ext2Del, err = inode.RestoreExts2NextLayer(mpId, inode.Extents.eks, mpVer, 0); err != nil {
log.LogErrorf("action[getAndDelVerInList] ino[%v] RestoreMultiSnapExts split error %v", inode.Inode, err)
status = proto.OpNotExistErr
log.LogDebugf("action[unlinkTopLayer] mp[%v] iino[%v]", mpId, ino)
return
}
inode.Extents.eks = inode.Extents.eks[:0]
log.LogDebugf("action[getAndDelVerInList] mp[%v] ino[%v] verseq [%v] get del exts %v", mpId, inode.Inode, inode.getVer(), ext2Del)
inode.DecNLink() // dIno should be inode
doMore = true
return
}
// If the top-layer verSeq equals the mp version, the inode deletion only happens on the first layer.
// A client-side delete may also operate on the top layer and is allowed to remove an inode whose version is older than the mp version,
// because a delete has two steps: 1) delete the dentry, 2) unlink the inode, and the version may be updated between the two steps.
// To make sure the inode is unlinked by a normal deletion, the SDK carries the dentry verSeq to distinguish it from other unlink actions.
if mpVer == inode.getVer() {
if inode.getLayerLen() == 0 {
log.LogDebugf("action[unlinkTopLayer] no snapshot available depends on ino[%v] not found seq [%v] and return, verlist %v", ino, inode.getVer(), verlist)
inode.DecNLink()
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked", ino.Inode)
// operate inode directly
doMore = true
return
}
log.LogDebugf("action[unlinkTopLayer] need restore.ino[%v] withseq [%v] equal mp seq, verlist %v",
ino, inode.getVer(), verlist)
// need restore
if !proto.IsDir(inode.Type) {
delFunc()
return
}
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked, Dir", ino.Inode)
inode.DecNLink()
doMore = true
return
}
log.LogDebugf("action[unlinkTopLayer] need create version.ino[%v] withseq [%v] not equal mp seq [%v], verlist %v", ino, inode.getVer(), mpVer, verlist)
if proto.IsDir(inode.Type) { // a dir carries the whole info while a file inode is partitioned, which is quite different
_, err := verlist.GetNextOlderVer(mpVer)
if err == nil {
log.LogDebugf("action[unlinkTopLayer] inode[%v] cann't get next older ver [%v] err %v", inode.Inode, mpVer, err)
inode.CreateVer(mpVer)
}
inode.DecNLink()
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked, Dir create ver 1st layer", ino.Inode)
doMore = true
} else {
ver, err := verlist.GetNextOlderVer(mpVer)
if err != nil {
if err.Error() == "not found" {
delFunc()
return
}
log.LogErrorf("action[unlinkTopLayer] inode[%v] cann't get next older ver [%v] err %v", inode.Inode, mpVer, err)
return
}
inode.CreateVer(mpVer) // protect origin version
if inode.NLink == 1 {
inode.CreateUnlinkVer(mpVer, ver) // create an effective top-level version
}
inode.DecNLink()
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked, File create ver 1st layer", ino.Inode)
}
return
}
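// dirUnlinkVerInlist unlinks a directory snapshot layer: the layer is dropped only if no
// live snapshot in verlist still falls inside its effective version range.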
func (inode *Inode) dirUnlinkVerInlist(ino *Inode, mpVer uint64, verlist *proto.VolVersionInfoList) (ext2Del []proto.ExtentKey, doMore bool, status uint8) {
var idxWithTopLayer int
var dIno *Inode
status = proto.OpOk
if dIno, idxWithTopLayer = inode.getInoByVer(ino.getVer(), false); dIno == nil {
log.LogDebugf("action[dirUnlinkVerInlist] ino[%v] not found", ino)
return
}
var endSeq uint64
if idxWithTopLayer == 0 {
// the head (top) layer is depended on and must not be dropped; do nothing here
log.LogDebugf("action[dirUnlinkVerInlist] ino[%v] first layer do nothing", ino)
return
}
// if any live snapshot at the mp level exists in the seq range from dIno up to its next ascending neighbor, the dIno snapshot is kept; otherwise it is dropped
if inode.multiSnap == nil {
log.LogWarnf("action[dirUnlinkVerInlist] ino[%v] multiSnap should not be nil", inode)
inode.multiSnap = &InodeMultiSnap{}
}
mIdx := idxWithTopLayer - 1
if mIdx == 0 {
endSeq = inode.getVer()
} else {
endSeq = inode.multiSnap.multiVersions[mIdx-1].getVer()
}
log.LogDebugf("action[dirUnlinkVerInlist] inode[%v] try drop multiVersion idx %v effective seq scope [%v,%v) ",
inode.Inode, mIdx, dIno.getVer(), endSeq)
doWork := func() bool {
verlist.RWLock.RLock()
defer verlist.RWLock.RUnlock()
for vidx, info := range verlist.VerList {
if info.Ver >= dIno.getVer() && info.Ver < endSeq {
log.LogDebugf("action[dirUnlinkVerInlist] inode[%v] dir layer idx %v still have effective snapshot seq [%v].so don't drop", inode.Inode, mIdx, info.Ver)
return false
}
if info.Ver >= endSeq || vidx == len(verlist.VerList)-1 {
log.LogDebugf("action[dirUnlinkVerInlist] inode[%v] try drop multiVersion idx %v and return", inode.Inode, mIdx)
inode.Lock()
inode.multiSnap.multiVersions = append(inode.multiSnap.multiVersions[:mIdx], inode.multiSnap.multiVersions[mIdx+1:]...)
inode.Unlock()
return true
}
log.LogDebugf("action[dirUnlinkVerInlist] inode[%v] try drop scope [%v, %v), mp ver [%v] not suitable", inode.Inode, dIno.getVer(), endSeq, info.Ver)
return true
}
return true
}
if !doWork() {
return
}
doMore = true
dIno.DecNLink()
return
}
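// unlinkVerInList unlinks the snapshot-layer inode selected by ino's version; directories
// are delegated to dirUnlinkVerInlist.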
func (inode *Inode) unlinkVerInList(mpId uint64, ino *Inode, mpVer uint64, verlist *proto.VolVersionInfoList) (ext2Del []proto.ExtentKey, doMore bool, status uint8) {
log.LogDebugf("action[unlinkVerInList] mpId [%v] ino[%v] try search seq [%v] isdir %v", mpId, ino, ino.getVer(), proto.IsDir(inode.Type))
if proto.IsDir(inode.Type) { // snapshot dir deletion don't take link into consider, but considers the scope of snapshot contrast to verList
return inode.dirUnlinkVerInlist(ino, mpVer, verlist)
}
var dIno *Inode
status = proto.OpOk
// special case: the snapshot is the last one and is depended on by an upper version, so update its version to the right one
// search upward until the current uncommitted version in the verList
if ino.getVer() == inode.getVer() || (isInitSnapVer(ino.getVer()) && inode.getVer() == 0) {
if len(verlist.VerList) == 0 {
status = proto.OpNotExistErr
log.LogErrorf("action[unlinkVerInList] inode[%v] verlist should be larger than 0, return not found", inode.Inode)
return
}
// just move to the upper layer; the requested snapshot is dropped
nVerSeq, found := inode.getLastestVer(inode.getVer(), verlist)
if !found {
status = proto.OpNotExistErr
return
}
log.LogDebugf("action[unlinkVerInList] inode[%v] update current verseq [%v] to %v", inode.Inode, inode.getVer(), nVerSeq)
inode.setVer(nVerSeq)
return
} else {
// don't unlink if no version satisfies the request
if ext2Del, dIno = inode.getAndDelVerInList(mpId, ino.getVer(), mpVer, verlist); dIno == nil {
status = proto.OpNotExistErr
log.LogDebugf("action[unlinkVerInList] ino[%v]", ino)
return
}
}
dIno.DecNLink()
log.LogDebugf("action[unlinkVerInList] inode[%v] snapshot layer be unlinked", ino.Inode)
doMore = true
return
}
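// ShouldDelVer reports whether version delVer can be deleted on this inode, returning a
// "not found" error when no matching version exists.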
func (i *Inode) ShouldDelVer(delVer uint64, mpVer uint64) (ok bool, err error) {
if i.getVer() == 0 {
if delVer > 0 {
if isInitSnapVer(delVer) {
return true, nil
}
return false, fmt.Errorf("not found")
} else {
// an mp version larger than zero means a snapshot happened but has not yet taken effect on this inode
if mpVer > 0 {
return false, nil
}
return true, nil
}
} else {
if delVer > i.getVer() {
return false, fmt.Errorf("not found")
} else if delVer == i.getVer() {
return true, nil
}
}
if isInitSnapVer(delVer) {
tailVer, _ := i.getTailVerInList()
if tailVer == 0 {
return true, nil
}
return false, fmt.Errorf("not found")
}
if i.multiSnap == nil {
return false, fmt.Errorf("not found")
}
for _, inoVer := range i.multiSnap.multiVersions {
if inoVer.getVer() == delVer {
return true, nil
}
if inoVer.getVer() < delVer {
break
}
}
return false, fmt.Errorf("not found")
}
// The returned idx counts the top layer as index 0, so indexes into multiSnap.multiVersions are offset by 1.
//
// Note: searches all layers.
func (ino *Inode) getInoByVer(verSeq uint64, equal bool) (i *Inode, idx int) {
ino.RLock()
defer ino.RUnlock()
if verSeq == 0 || verSeq == ino.getVer() || (isInitSnapVer(verSeq) && ino.getVer() == 0) {
return ino, 0
}
if isInitSnapVer(verSeq) {
listLen := ino.getLayerLen()
if listLen == 0 {
log.LogDebugf("action[getInoByVer] ino[%v] no multiversion", ino.Inode)
return
}
i = ino.multiSnap.multiVersions[listLen-1]
if i.getVer() != 0 {
log.LogDebugf("action[getInoByVer] ino[%v] lay seq [%v]", ino.Inode, i.getVer())
return nil, 0
}
return i, listLen
}
if verSeq > 0 && ino.getVer() > verSeq {
if ino.multiSnap != nil {
for id, iTmp := range ino.multiSnap.multiVersions {
if verSeq == iTmp.getVer() {
log.LogDebugf("action[getInoByVer] ino[%v] get in multiversion id[%v]", ino.Inode, id)
return iTmp, id + 1
} else if verSeq > iTmp.getVer() {
if !equal {
log.LogDebugf("action[getInoByVer] ino[%v] get in multiversion id[%v], %v, %v", ino.Inode, id, verSeq, iTmp.getVer())
return iTmp, id + 1
}
log.LogDebugf("action[getInoByVer] ino[%v] get in multiversion id[%v]", ino.Inode, id)
return
}
}
}
} else {
if !equal {
log.LogDebugf("action[getInoByVer] ino[%v]", ino.Inode)
return ino, 0
}
}
return
}
// 1. Check whether the dVer layer is the last layer in the system: if so, drop it entirely; otherwise go to step 2.
// 2. If a system layer exists between dVer and the inode's next older layer (it may not exist), drop the extents related to dVer and update the version.
// 3. Otherwise, restore the extents to the inode's next layer.
func (i *Inode) getAndDelVerInList(mpId uint64, dVer uint64, mpVer uint64, verlist *proto.VolVersionInfoList) (delExtents []proto.ExtentKey, ino *Inode) {
var err error
verlist.RWLock.RLock()
defer verlist.RWLock.RUnlock()
log.LogDebugf("action[getAndDelVerInList] ino[%v] verseq [%v] request del ver [%v] hist len %v isTmpFile %v",
i.Inode, i.getVer(), dVer, i.getLayerLen(), i.IsTempFile())
// reading inode fields here is fine; the lock is only needed for writes
inoVerLen := i.getLayerLen()
if inoVerLen == 0 {
log.LogDebugf("action[getAndDelVerInList] ino[%v] RestoreMultiSnapExts no left", i.Inode)
return
}
// delete snapshot version
if isInitSnapVer(dVer) {
dVer = 0
}
lastVer := i.getVer()
for id, mIno := range i.multiSnap.multiVersions {
log.LogDebugf("action[getAndDelVerInList] ino[%v] multiSnap.multiVersions level %v verseq [%v]", i.Inode, id, mIno.getVer())
if mIno.getVer() < dVer {
log.LogDebugf("action[getAndDelVerInList] ino[%v] multiSnap.multiVersions level %v verseq [%v]", i.Inode, id, mIno.getVer())
return
}
if mIno.getVer() == dVer {
log.LogDebugf("action[getAndDelVerInList] ino[%v] ver [%v] step 3", i.Inode, mIno.getVer())
// 2. the next version should be determined from the verList, not only from the inode's own multi-version list
var nVerSeq uint64
if nVerSeq, err = verlist.GetNextNewerVer(dVer); err != nil {
log.LogDebugf("action[getAndDelVerInList] get next version failed, err %v", err)
return
}
if lastVer > nVerSeq {
mIno.setVer(nVerSeq)
return
}
if i.isTailIndexInList(id) {
i.multiSnap.multiVersions = i.multiSnap.multiVersions[:inoVerLen-1]
log.LogDebugf("action[getAndDelVerInList] ino[%v] idx %v be dropped", i.Inode, inoVerLen)
return mIno.Extents.eks, mIno
}
if nVerSeq, err = verlist.GetNextOlderVer(dVer); err != nil {
log.LogDebugf("action[getAndDelVerInList] get next version failed, err %v", err)
return
}
log.LogDebugf("action[getAndDelVerInList] ino[%v] ver [%v] nextVerseq [%v] step 3 ver ", i.Inode, mIno.getVer(), nVerSeq)
// 2. the system's next layer does not exist in the inode's version list: update the current layer to the next layer and filter out extents with this verSeq,
// i.e. change this layer's verSeq to the neighbor layer's info and skip the version-delete process
if nVerSeq > i.multiSnap.multiVersions[id+1].getVer() {
log.LogDebugf("action[getAndDelVerInList] ino[%v] get next version in verList update ver from %v to %v.And delete exts with ver [%v]",
i.Inode, i.multiSnap.multiVersions[id].getVer(), nVerSeq, dVer)
i.multiSnap.multiVersions[id].setVerNoCheck(nVerSeq)
i.multiSnap.multiVersions[id] = i.CopyInodeOnly(i.multiSnap.multiVersions[id+1])
delExtents = i.MultiLayerClearExtByVer(id+1, dVer)
ino = i.multiSnap.multiVersions[id]
if len(i.multiSnap.multiVersions[id].Extents.eks) != 0 {
log.LogDebugf("action[getAndDelVerInList] ino[%v] after clear self still have ext and left", i.Inode)
return
}
} else {
log.LogDebugf("action[getAndDelVerInList] ino[%v] ver [%v] nextver [%v] step 3 ver ", i.Inode, mIno.getVer(), nVerSeq)
// 3. the next layer exists: the deleted version and the next version are neighbors in the verlist, so restore the extents and then delete the layer
if delExtents, err = i.RestoreExts2NextLayer(mpId, mIno.Extents.eks, dVer, id+1); err != nil {
log.LogDebugf("action[getAndDelVerInList] ino[%v] RestoreMultiSnapExts split error %v", i.Inode, err)
return
}
}
// delete layer id
i.multiSnap.multiVersions = append(i.multiSnap.multiVersions[:id], i.multiSnap.multiVersions[id+1:]...)
log.LogDebugf("action[getAndDelVerInList] ino[%v] verseq [%v] get del exts %v", i.Inode, i.getVer(), delExtents)
return delExtents, mIno
}
lastVer = mIno.getVer()
}
return
}
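// getLastestVer returns the first version in verlist that is newer than reqVerSeq, if any.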
func (i *Inode) getLastestVer(reqVerSeq uint64, verlist *proto.VolVersionInfoList) (uint64, bool) {
verlist.RWLock.RLock()
defer verlist.RWLock.RUnlock()
if len(verlist.VerList) == 0 {
return 0, false
}
for _, info := range verlist.VerList {
if info.Ver > reqVerSeq {
return info.Ver, true
}
}
log.LogDebugf("action[getLastestVer] inode[%v] reqVerseq [%v] not found, the largetst one %v",
i.Inode, reqVerSeq, verlist.VerList[len(verlist.VerList)-1].Ver)
return 0, false
}
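// CreateUnlinkVer snapshots the current inode (including its extents) into the newest
// snapshot layer with version nVer, then clears the top layer's extents, marks it deleted
// and advances its version to mpVer.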
func (i *Inode) CreateUnlinkVer(mpVer uint64, nVer uint64) {
log.LogDebugf("action[CreateUnlinkVer] inode[%v] mpver [%v] nver [%v]", i.Inode, mpVer, nVer)
// inode copy not include multi ver array
ino := i.CopyDirectly().(*Inode)
ino.setVer(nVer)
i.Extents = NewSortedExtents()
i.ObjExtents = NewSortedObjExtents()
i.SetDeleteMark()
log.LogDebugf("action[CreateUnlinkVer] inode[%v] create new version [%v] and store old one [%v], hist len [%v]",
i.Inode, mpVer, i.getVer(), i.getLayerLen())
i.Lock()
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
if i.getLayerVer(0) == nVer {
i.multiSnap.multiVersions[0] = ino
} else {
i.multiSnap.multiVersions = append([]*Inode{ino}, i.multiSnap.multiVersions...)
}
i.setVer(mpVer)
i.Unlock()
}
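// CreateVer prepends a snapshot layer that records the current version (with empty extent
// lists) and advances the top-layer version to ver.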
func (i *Inode) CreateVer(ver uint64) {
// inode copy not include multi ver array
ino := i.CopyDirectly().(*Inode)
ino.Extents = NewSortedExtents()
ino.ObjExtents = NewSortedObjExtents()
ino.setVer(i.getVer())
i.setVer(ver)
i.Lock()
defer i.Unlock()
log.LogDebugf("action[CreateVer] inode[%v] create new version [%v] and store old one [%v], hist len [%v]",
i.Inode, ver, i.getVer(), i.getLayerLen())
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
i.multiSnap.multiVersions = append([]*Inode{ino}, i.multiSnap.multiVersions...)
}
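// buildMultiSnap lazily allocates the multiSnap structure and its ekRefMap.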
func (i *Inode) buildMultiSnap() {
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
if i.multiSnap.ekRefMap == nil {
i.multiSnap.ekRefMap = new(sync.Map)
}
}
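// SplitExtentWithCheck splits an extent under the current mp version: it creates a new
// version layer first if the mp version has advanced, then restores any displaced extents
// to the next lower layer.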
func (i *Inode) SplitExtentWithCheck(param *AppendExtParam) (delExtents []proto.ExtentKey, status uint8) {
var err error
param.ek.SetSeq(param.mpVer)
log.LogDebugf("action[SplitExtentWithCheck] mpId[%v].inode[%v],ek [%v],hist len %v", param.mpId, i.Inode, param.ek, i.getLayerLen())
if param.mpVer != i.getVer() {
log.LogDebugf("action[SplitExtentWithCheck] mpId[%v].CreateVer ver [%v]", param.mpId, param.mpVer)
i.CreateVer(param.mpVer)
}
i.Lock()
defer i.Unlock()
i.buildMultiSnap()
delExtents, status = i.Extents.SplitWithCheck(param.mpId, i.Inode, param.ek, i.multiSnap.ekRefMap)
if status != proto.OpOk {
log.LogErrorf("action[SplitExtentWithCheck] mpId[%v].status [%v]", param.mpId, status)
return
}
if len(delExtents) == 0 {
return
}
if err = i.CreateLowerVersion(i.getVer(), param.multiVersionList); err != nil {
return
}
if delExtents, err = i.RestoreExts2NextLayer(param.mpId, delExtents, param.mpVer, 0); err != nil {
log.LogErrorf("action[fsmAppendExtentWithCheck] mpId[%v].ino[%v] RestoreMultiSnapExts split error %v", param.mpId, i.Inode, err)
return
}
if proto.IsHot(param.volType) {
i.Generation++
i.ModifyTime = param.ct
}
return
}
// try to create version between curVer and seq of multiSnap.multiVersions[0] in verList
func (i *Inode) CreateLowerVersion(curVer uint64, verlist *proto.VolVersionInfoList) (err error) {
verlist.RWLock.RLock()
defer verlist.RWLock.RUnlock()
log.LogDebugf("CreateLowerVersion inode[%v] curver [%v]", i.Inode, curVer)
if len(verlist.VerList) <= 1 {
return
}
if i.isEmptyVerList() {
return
}
var nextVer uint64
for _, info := range verlist.VerList {
if info.Ver < curVer {
nextVer = info.Ver
}
if info.Ver >= curVer {
break
}
}
if nextVer <= i.getLayerVer(0) {
log.LogDebugf("CreateLowerVersion nextver [%v] layer 0 ver [%v]", nextVer, i.getLayerVer(0))
return
}
ino := i.CopyDirectly().(*Inode)
ino.Extents = NewSortedExtents()
ino.ObjExtents = NewSortedObjExtents()
ino.setVer(nextVer)
log.LogDebugf("action[CreateLowerVersion] inode[%v] create new version [%v] and store old one [%v], hist len [%v]",
i.Inode, ino, i.getVer(), i.getLayerLen())
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
i.multiSnap.multiVersions = append([]*Inode{ino}, i.multiSnap.multiVersions...)
return
}
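// AppendExtParam bundles the arguments of AppendExtentWithCheck and SplitExtentWithCheck.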
type AppendExtParam struct {
mpId uint64
mpVer uint64
multiVersionList *proto.VolVersionInfoList
ek proto.ExtentKey
ct int64
discardExtents []proto.ExtentKey
volType int
}
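// AppendExtentWithCheck appends an extent key under the current mp version; when
// multi-versioning is active, extents replaced by the append are restored to the next
// lower layer instead of being deleted outright.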
func (i *Inode) AppendExtentWithCheck(param *AppendExtParam) (delExtents []proto.ExtentKey, status uint8) {
param.ek.SetSeq(param.mpVer)
log.LogDebugf("action[AppendExtentWithCheck] mpId[%v].mpver [%v] inode[%v] and fsm ver [%v],ek [%v],hist len %v",
param.mpId, param.mpVer, i.Inode, i.getVer(), param.ek, i.getLayerLen())
if param.mpVer != i.getVer() {
log.LogInfof("action[AppendExtentWithCheck] mpId[%v].inode ver [%v]", param.mpId, i.getVer())
i.CreateVer(param.mpVer)
}
i.Lock()
defer i.Unlock()
refFunc := func(key *proto.ExtentKey) { i.insertEkRefMap(param.mpId, key) }
delExtents, status = i.Extents.AppendWithCheck(i.Inode, param.ek, refFunc, param.discardExtents)
if status != proto.OpOk {
log.LogErrorf("action[AppendExtentWithCheck] mpId[%v].status [%v]", param.mpId, status)
return
}
// multi version take effect
if i.getVer() > 0 && len(delExtents) > 0 {
var err error
if err = i.CreateLowerVersion(i.getVer(), param.multiVersionList); err != nil {
return
}
if delExtents, err = i.RestoreExts2NextLayer(param.mpId, delExtents, param.mpVer, 0); err != nil {
log.LogErrorf("action[AppendExtentWithCheck] mpId[%v].RestoreMultiSnapExts err %v", param.mpId, err)
return nil, proto.OpErr
}
}
if proto.IsHot(param.volType) {
size := i.Extents.Size()
if i.Size < size {
i.Size = size
}
i.Generation++
i.ModifyTime = param.ct
}
return
}
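// ExtentsTruncate truncates the extent list to length, updates size, mtime and generation,
// and returns the truncated extents.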
func (i *Inode) ExtentsTruncate(length uint64, ct int64, doOnLastKey func(*proto.ExtentKey), insertRefMap func(ek *proto.ExtentKey)) (delExtents []proto.ExtentKey) {
delExtents = i.Extents.Truncate(length, doOnLastKey, insertRefMap)
i.Size = length
i.ModifyTime = ct
i.Generation++
return
}
// IncNLink increases the nLink value by one.
func (i *Inode) IncNLink(verSeq uint64) {
if i.getVer() < verSeq {
i.CreateVer(verSeq)
}
i.Lock()
i.NLink++
i.Unlock()
}
// DecNLink decreases the nLink value by one.
func (i *Inode) DecNLink() {
i.Lock()
if proto.IsDir(i.Type) && i.NLink == 2 {
i.NLink--
}
if i.NLink > 0 {
i.NLink--
}
i.Unlock()
}
// DecNLinkByVer creates a new version when verSeq is newer than the inode's version, then decreases the nLink value by one.
func (i *Inode) DecNLinkByVer(verSeq uint64) {
if i.getVer() < verSeq {
i.CreateVer(verSeq)
}
i.DecNLink()
}
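// DecSplitExts decrements the split reference count of every split extent in delExtents;
// when the last reference is released, the split flag is cleared so the extent content can be removed.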
func (i *Inode) DecSplitExts(mpId uint64, delExtents interface{}) {
log.LogDebugf("[DecSplitExts] mpId [%v] inode[%v]", mpId, i.Inode)
cnt := len(delExtents.([]proto.ExtentKey))
for id := 0; id < cnt; id++ {
ek := &delExtents.([]proto.ExtentKey)[id]
if !ek.IsSplit() {
log.LogDebugf("[DecSplitExts] mpId [%v] ek not split %v", mpId, ek)
continue
}
if i.multiSnap == nil || i.multiSnap.ekRefMap == nil {
log.LogErrorf("[DecSplitExts] mpid [%v]. inode[%v] multiSnap.ekRefMap is nil", mpId, i.Inode)
return
}
ok, last := i.DecSplitEk(mpId, ek)
if !ok {
log.LogErrorf("[DecSplitExts] mpid [%v]. ek [%v] not found!", mpId, ek)
continue
}
if last {
log.LogDebugf("[DecSplitExts] mpid [%v] ek [%v] split flag be unset to remove all content", mpId, ek)
ek.SetSplit(false)
}
}
}
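// DecSplitEk decrements the reference count of a split extent keyed by PartitionId<<32|ExtentId;
// it reports whether the key was found and whether this was the last reference.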
func (i *Inode) DecSplitEk(mpId uint64, ext *proto.ExtentKey) (ok bool, last bool) {
log.LogDebugf("[DecSplitEk] mpId[%v] inode[%v] dp [%v] extent id[%v].key %v ext %v", mpId, i.Inode, ext.PartitionId, ext.ExtentId,
ext.PartitionId<<32|ext.ExtentId, ext)
if i.multiSnap == nil || i.multiSnap.ekRefMap == nil {
log.LogErrorf("DecSplitEk. multiSnap %v", i.multiSnap)
return
}
if val, ok := i.multiSnap.ekRefMap.Load(ext.PartitionId<<32 | ext.ExtentId); !ok {
log.LogErrorf("[DecSplitEk] mpId[%v]. dp [%v] inode[%v] ext not found", mpId, ext.PartitionId, i.Inode)
return false, false
} else {
if val.(uint32) == 0 {
log.LogErrorf("[DecSplitEk] mpId[%v]. dp [%v] inode[%v] ek ref is zero!", mpId, ext.PartitionId, i.Inode)
return false, false
}
if val.(uint32) == 1 {
log.LogDebugf("[DecSplitEk] mpId[%v] inode[%v] dp [%v] extent id[%v].key %v", mpId, i.Inode, ext.PartitionId, ext.ExtentId,
ext.PartitionId<<32|ext.ExtentId)
i.multiSnap.ekRefMap.Delete(ext.PartitionId<<32 | ext.ExtentId)
return true, true
}
i.multiSnap.ekRefMap.Store(ext.PartitionId<<32|ext.ExtentId, val.(uint32)-1)
log.LogDebugf("[DecSplitEk] mpId[%v]. extend dp [%v] inode[%v] ek [%v] val %v", mpId, ext.PartitionId, i.Inode, ext, val.(uint32)-1)
return true, false
}
}
// GetDecNLinkResult returns the nLink value as it would be after a DecNLink, without modifying the inode.
func (i *Inode) GetDecNLinkResult() (nLink uint32) {
i.Lock()
nLink = i.NLink
if proto.IsDir(i.Type) && nLink == 2 {
nLink--
}
if nLink > 0 {
nLink--
}
i.Unlock()
return
}
// GetNLink returns the nLink value.
func (i *Inode) GetNLink() uint32 {
i.RLock()
defer i.RUnlock()
return i.NLink
}
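// IsTempFile reports whether the inode is a non-directory whose link count has dropped to zero.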
func (i *Inode) IsTempFile() bool {
i.RLock()
ok := i.NLink == 0 && !proto.IsDir(i.Type)
i.RUnlock()
return ok
}
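// IsEmptyDir reports whether the inode is a directory with NLink <= 2.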
func (i *Inode) IsEmptyDir() bool {
i.RLock()
ok := proto.IsDir(i.Type) && i.NLink <= 2
i.RUnlock()
return ok
}
func (i *Inode) IsEmptyDirAndNoSnapshot() bool {
i.RLock()
ok := proto.IsDir(i.Type) && i.NLink <= 2 && i.getLayerLen() == 0
i.RUnlock()
return ok
}
func (i *Inode) IsTopLayerEmptyDir() bool {
i.RLock()
ok := proto.IsDir(i.Type) && i.NLink <= 2
i.RUnlock()
return ok
}
// SetDeleteMark sets the DeleteMarkFlag. TODO: markDelete or deleteMark? markDelete has been used in datanode.
func (i *Inode) SetDeleteMark() {
i.Lock()
i.Flag |= DeleteMarkFlag
i.Unlock()
}
// ShouldDelete returns if the inode has been marked as deleted.
func (i *Inode) ShouldDelete() (ok bool) {
i.RLock()
ok = i.Flag&DeleteMarkFlag == DeleteMarkFlag
i.RUnlock()
return
}
// ShouldDelayDelete returns true if the inode should be delay-deleted, i.e. all three conditions hold:
// 1. DeleteMarkFlag is unset
// 2. NLink == 0
// 3. AccessTime is within InodeNLink0DelayDeleteSeconds of now
func (i *Inode) ShouldDelayDelete() (ok bool) {
i.RLock()
ok = (i.Flag&DeleteMarkFlag != DeleteMarkFlag) &&
(i.NLink == 0) &&
time.Now().Unix()-i.AccessTime < InodeNLink0DelayDeleteSeconds
i.RUnlock()
return
}
// SetAttr sets the attributes of the inode.
func (i *Inode) SetAttr(req *SetattrRequest) {
log.LogDebugf("action[SetAttr] inode[%v] req seq [%v] inode seq [%v]", i.Inode, req.VerSeq, i.getVer())
if req.VerSeq != i.getVer() {
i.CreateVer(req.VerSeq)
}
i.Lock()
log.LogDebugf("action[SetAttr] inode[%v] req seq [%v] inode seq [%v]", i.Inode, req.VerSeq, i.getVer())
if req.Valid&proto.AttrMode != 0 {
i.Type = req.Mode
}
if req.Valid&proto.AttrUid != 0 {
i.Uid = req.Uid
}
if req.Valid&proto.AttrGid != 0 {
i.Gid = req.Gid
}
if req.Valid&proto.AttrAccessTime != 0 {
i.AccessTime = req.AccessTime
}
if req.Valid&proto.AttrModifyTime != 0 {
i.ModifyTime = req.ModifyTime
}
i.Unlock()
}
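// DoWriteFunc executes the given function while holding the inode's write lock.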
func (i *Inode) DoWriteFunc(fn func()) {
i.Lock()
defer i.Unlock()
fn()
}
// DoReadFunc executes the given function while holding the inode's read lock.
func (i *Inode) DoReadFunc(fn func()) {
i.RLock()
defer i.RUnlock()
fn()
}
// SetMtime sets mtime to the current time.
func (i *Inode) SetMtime() {
mtime := timeutil.GetCurrentTimeUnix()
i.Lock()
defer i.Unlock()
i.ModifyTime = mtime
}
// EmptyExtents clears the inode's extent list and returns the removed extents.
func (i *Inode) EmptyExtents(mtime int64) (delExtents []proto.ExtentKey) {
i.Lock()
defer i.Unlock()
// handing out eks is safe: Extents is reset right below and eks will only be visited by the delete routine
delExtents = i.Extents.eks
i.Extents = NewSortedExtents()
return delExtents
}
// CopyTinyExtents returns a copy of the inode's tiny extents.
func (i *Inode) CopyTinyExtents() (delExtents []proto.ExtentKey) {
i.RLock()
defer i.RUnlock()
return i.Extents.CopyTinyExtents()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"fmt"
syslog "log"
"net"
"os"
"path"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/loadutil"
"github.com/cubefs/cubefs/util/log"
)
const (
partitionPrefix = "partition_"
ExpiredPartitionPrefix = "expired_"
)
const sampleDuration = 1 * time.Second
// MetadataManager manages all the meta partitions.
type MetadataManager interface {
Start() error
Stop()
// CreatePartition(id string, start, end uint64, peers []proto.Peer) error
HandleMetadataOperation(conn net.Conn, p *Packet, remoteAddr string) error
GetPartition(id uint64) (MetaPartition, error)
GetLeaderPartitions() map[uint64]MetaPartition
checkVolVerList() (err error)
}
// MetadataManagerConfig defines the configures in the metadata manager.
type MetadataManagerConfig struct {
NodeID uint64
RootDir string
ZoneName string
RaftStore raftstore.RaftStore
}
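// verOp2Phase tracks the two-phase (prepare/commit) state of a per-volume version update.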
type verOp2Phase struct {
verSeq uint64
verPrepare uint64
status uint32
step uint32
isActiveReqToMaster bool
sync.Mutex
}
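// metadataManager is the default MetadataManager implementation; it owns every meta
// partition loaded on this meta node.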
type metadataManager struct {
nodeId uint64
zoneName string
rootDir string
raftStore raftstore.RaftStore
connPool *util.ConnectPool
state uint32
mu sync.RWMutex
partitions map[uint64]MetaPartition // Key: metaRangeId, Val: metaPartition
metaNode *MetaNode
flDeleteBatchCount atomic.Value
fileStatsEnable bool
curQuotaGoroutineNum int32
maxQuotaGoroutineNum int32
cpuUtil atomicutil.Float64
stopC chan struct{}
volUpdating *sync.Map // map[string]*verOp2Phase
verUpdateChan chan string
}
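// getPacketLabels builds the exporter metric labels (op, partition ID, volume) for a packet;
// heartbeat and create-partition packets keep empty partition and volume labels.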
func (m *metadataManager) getPacketLabels(p *Packet) (labels map[string]string) {
labels = make(map[string]string)
labels[exporter.Op] = p.GetOpMsg()
labels[exporter.PartId] = ""
labels[exporter.Vol] = ""
if p.Opcode == proto.OpMetaNodeHeartbeat || p.Opcode == proto.OpCreateMetaPartition {
return
}
mp, err := m.getPartition(p.PartitionID)
if err != nil {
log.LogInfof("[metaManager] getPacketLabels metric packet: %v", p)
return
}
if exporter.EnablePid {
labels[exporter.PartId] = fmt.Sprintf("%d", p.PartitionID)
}
labels[exporter.Vol] = mp.GetBaseConfig().VolName
return
}
// HandleMetadataOperation handles the metadata operations.
func (m *metadataManager) HandleMetadataOperation(conn net.Conn, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if log.EnableInfo() {
log.LogInfof("HandleMetadataOperation input info op (%s), data %s, remote %s", p.String(), string(p.Data), remoteAddr)
}
metric := exporter.NewTPCnt(p.GetOpMsg())
labels := m.getPacketLabels(p)
defer func() {
metric.SetWithLabels(err, labels)
if err != nil {
log.LogWarnf("HandleMetadataOperation output (%s), remote %s, err %s", p.String(), remoteAddr, err.Error())
return
}
if log.EnableInfo() {
log.LogInfof("HandleMetadataOperation out (%s), result (%s), remote %s, cost %s", p.String(),
p.GetResultMsg(), remoteAddr, time.Since(start).String())
}
}()
switch p.Opcode {
case proto.OpMetaCreateInode:
err = m.opCreateInode(conn, p, remoteAddr)
case proto.OpMetaLinkInode:
err = m.opMetaLinkInode(conn, p, remoteAddr)
case proto.OpMetaFreeInodesOnRaftFollower:
err = m.opFreeInodeOnRaftFollower(conn, p, remoteAddr)
case proto.OpMetaUnlinkInode:
err = m.opMetaUnlinkInode(conn, p, remoteAddr)
case proto.OpMetaBatchUnlinkInode:
err = m.opMetaBatchUnlinkInode(conn, p, remoteAddr)
case proto.OpMetaInodeGet:
err = m.opMetaInodeGet(conn, p, remoteAddr)
case proto.OpMetaEvictInode:
err = m.opMetaEvictInode(conn, p, remoteAddr)
case proto.OpMetaBatchEvictInode:
err = m.opBatchMetaEvictInode(conn, p, remoteAddr)
case proto.OpMetaSetattr:
err = m.opSetAttr(conn, p, remoteAddr)
case proto.OpMetaCreateDentry:
err = m.opCreateDentry(conn, p, remoteAddr)
case proto.OpMetaDeleteDentry:
err = m.opDeleteDentry(conn, p, remoteAddr)
case proto.OpMetaBatchDeleteDentry:
err = m.opBatchDeleteDentry(conn, p, remoteAddr)
case proto.OpMetaUpdateDentry:
err = m.opUpdateDentry(conn, p, remoteAddr)
case proto.OpMetaReadDir:
err = m.opReadDir(conn, p, remoteAddr)
case proto.OpMetaReadDirOnly:
err = m.opReadDirOnly(conn, p, remoteAddr)
case proto.OpMetaReadDirLimit:
err = m.opReadDirLimit(conn, p, remoteAddr)
case proto.OpCreateMetaPartition:
err = m.opCreateMetaPartition(conn, p, remoteAddr)
case proto.OpMetaNodeHeartbeat:
err = m.opMasterHeartbeat(conn, p, remoteAddr)
case proto.OpMetaExtentsAdd:
err = m.opMetaExtentsAdd(conn, p, remoteAddr)
case proto.OpMetaExtentAddWithCheck:
err = m.opMetaExtentAddWithCheck(conn, p, remoteAddr)
case proto.OpMetaExtentsList:
err = m.opMetaExtentsList(conn, p, remoteAddr)
case proto.OpMetaObjExtentsList:
err = m.opMetaObjExtentsList(conn, p, remoteAddr)
case proto.OpMetaExtentsDel:
err = m.opMetaExtentsDel(conn, p, remoteAddr)
case proto.OpMetaTruncate:
err = m.opMetaExtentsTruncate(conn, p, remoteAddr)
case proto.OpMetaLookup:
err = m.opMetaLookup(conn, p, remoteAddr)
case proto.OpDeleteMetaPartition:
err = m.opDeleteMetaPartition(conn, p, remoteAddr)
case proto.OpUpdateMetaPartition:
err = m.opUpdateMetaPartition(conn, p, remoteAddr)
case proto.OpLoadMetaPartition:
err = m.opLoadMetaPartition(conn, p, remoteAddr)
case proto.OpDecommissionMetaPartition:
err = m.opDecommissionMetaPartition(conn, p, remoteAddr)
case proto.OpAddMetaPartitionRaftMember:
err = m.opAddMetaPartitionRaftMember(conn, p, remoteAddr)
case proto.OpRemoveMetaPartitionRaftMember:
err = m.opRemoveMetaPartitionRaftMember(conn, p, remoteAddr)
case proto.OpMetaPartitionTryToLeader:
err = m.opMetaPartitionTryToLeader(conn, p, remoteAddr)
case proto.OpMetaBatchInodeGet:
err = m.opMetaBatchInodeGet(conn, p, remoteAddr)
case proto.OpMetaDeleteInode:
err = m.opMetaDeleteInode(conn, p, remoteAddr)
case proto.OpMetaBatchDeleteInode:
err = m.opMetaBatchDeleteInode(conn, p, remoteAddr)
case proto.OpMetaBatchExtentsAdd:
err = m.opMetaBatchExtentsAdd(conn, p, remoteAddr)
case proto.OpMetaBatchObjExtentsAdd:
err = m.opMetaBatchObjExtentsAdd(conn, p, remoteAddr)
case proto.OpMetaClearInodeCache:
err = m.opMetaClearInodeCache(conn, p, remoteAddr)
// operations for extend attributes
case proto.OpMetaSetXAttr:
err = m.opMetaSetXAttr(conn, p, remoteAddr)
case proto.OpMetaBatchSetXAttr:
err = m.opMetaBatchSetXAttr(conn, p, remoteAddr)
case proto.OpMetaGetXAttr:
err = m.opMetaGetXAttr(conn, p, remoteAddr)
case proto.OpMetaGetAllXAttr:
err = m.opMetaGetAllXAttr(conn, p, remoteAddr)
case proto.OpMetaBatchGetXAttr:
err = m.opMetaBatchGetXAttr(conn, p, remoteAddr)
case proto.OpMetaRemoveXAttr:
err = m.opMetaRemoveXAttr(conn, p, remoteAddr)
case proto.OpMetaListXAttr:
err = m.opMetaListXAttr(conn, p, remoteAddr)
case proto.OpMetaUpdateXAttr:
err = m.opMetaUpdateXAttr(conn, p, remoteAddr)
// operations for multipart session
case proto.OpCreateMultipart:
err = m.opCreateMultipart(conn, p, remoteAddr)
case proto.OpListMultiparts:
err = m.opListMultipart(conn, p, remoteAddr)
case proto.OpRemoveMultipart:
err = m.opRemoveMultipart(conn, p, remoteAddr)
case proto.OpAddMultipartPart:
err = m.opAppendMultipart(conn, p, remoteAddr)
case proto.OpGetMultipart:
err = m.opGetMultipart(conn, p, remoteAddr)
// operations for transactions
case proto.OpMetaTxCreateInode:
err = m.opTxCreateInode(conn, p, remoteAddr)
case proto.OpMetaTxCreateDentry:
err = m.opTxCreateDentry(conn, p, remoteAddr)
case proto.OpTxCommit:
err = m.opTxCommit(conn, p, remoteAddr)
case proto.OpMetaTxCreate:
err = m.opTxCreate(conn, p, remoteAddr)
case proto.OpMetaTxGet:
err = m.opTxGet(conn, p, remoteAddr)
case proto.OpTxCommitRM:
err = m.opTxCommitRM(conn, p, remoteAddr)
case proto.OpTxRollbackRM:
err = m.opTxRollbackRM(conn, p, remoteAddr)
case proto.OpTxRollback:
err = m.opTxRollback(conn, p, remoteAddr)
case proto.OpMetaTxDeleteDentry:
err = m.opTxDeleteDentry(conn, p, remoteAddr)
case proto.OpMetaTxUnlinkInode:
err = m.opTxMetaUnlinkInode(conn, p, remoteAddr)
case proto.OpMetaTxUpdateDentry:
err = m.opTxUpdateDentry(conn, p, remoteAddr)
case proto.OpMetaTxLinkInode:
err = m.opTxMetaLinkInode(conn, p, remoteAddr)
case proto.OpMetaBatchSetInodeQuota:
err = m.opMetaBatchSetInodeQuota(conn, p, remoteAddr)
case proto.OpMetaBatchDeleteInodeQuota:
err = m.opMetaBatchDeleteInodeQuota(conn, p, remoteAddr)
case proto.OpMetaGetInodeQuota:
err = m.opMetaGetInodeQuota(conn, p, remoteAddr)
case proto.OpQuotaCreateInode:
err = m.opQuotaCreateInode(conn, p, remoteAddr)
case proto.OpQuotaCreateDentry:
err = m.opQuotaCreateDentry(conn, p, remoteAddr)
case proto.OpMetaGetUniqID:
err = m.opMetaGetUniqID(conn, p, remoteAddr)
// multi version
case proto.OpVersionOperation:
err = m.opMultiVersionOp(conn, p, remoteAddr)
case proto.OpGetExpiredMultipart:
err = m.opGetExpiredMultipart(conn, p, remoteAddr)
default:
err = fmt.Errorf("%s unknown Opcode: %d, reqId: %d", remoteAddr,
p.Opcode, p.GetReqID())
}
if err != nil {
err = errors.NewErrorf("%s [%s] req: %d - %s", remoteAddr, p.GetOpMsg(),
p.GetReqID(), err.Error())
}
return
}
// Start starts the metadata manager.
func (m *metadataManager) Start() (err error) {
if atomic.CompareAndSwapUint32(&m.state, common.StateStandby, common.StateStart) {
defer func() {
var newState uint32
if err != nil {
newState = common.StateStandby
} else {
newState = common.StateRunning
}
atomic.StoreUint32(&m.state, newState)
}()
err = m.onStart()
}
return
}
// Stop stops the metadata manager.
func (m *metadataManager) Stop() {
if atomic.CompareAndSwapUint32(&m.state, common.StateRunning, common.StateShutdown) {
defer atomic.StoreUint32(&m.state, common.StateStopped)
m.onStop()
}
}
func (m *metadataManager) startCpuSample() {
// async sample cpu util
go func() {
for {
select {
case <-m.stopC:
return
default:
used, err := loadutil.GetCpuUtilPercent(sampleDuration)
if err == nil {
m.cpuUtil.Store(used)
}
}
}
}()
}
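// startSnapshotVersionPromote starts a goroutine that promotes prepared snapshot versions
// for volumes pushed onto verUpdateChan.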
func (m *metadataManager) startSnapshotVersionPromote() {
m.verUpdateChan = make(chan string, 1000)
go func() {
for {
select {
case volName := <-m.verUpdateChan:
m.checkAndPromoteVersion(volName)
case <-m.stopC:
return
}
}
}()
}
// onStart creates the connection pool and loads the partitions.
func (m *metadataManager) onStart() (err error) {
m.connPool = util.NewConnectPool()
err = m.loadPartitions()
if err != nil {
return
}
m.stopC = make(chan struct{})
// start sampler
m.startCpuSample()
m.startSnapshotVersionPromote()
return
}
// onStop stops every meta partition and the background samplers.
func (m *metadataManager) onStop() {
if m.partitions != nil {
for _, partition := range m.partitions {
partition.Stop()
}
// stop sampler
close(m.stopC)
}
return
}
// getPartition returns the meta partition with the specified partition ID.
func (m *metadataManager) getPartition(id uint64) (mp MetaPartition, err error) {
m.mu.RLock()
defer m.mu.RUnlock()
mp, ok := m.partitions[id]
if ok {
return
}
err = errors.New(fmt.Sprintf("unknown meta partition: %d", id))
return
}
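// loadPartitions fetches this node's partition list from the master, renames expired
// partition directories, and loads the remaining meta partitions concurrently.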
func (m *metadataManager) loadPartitions() (err error) {
var metaNodeInfo *proto.MetaNodeInfo
for i := 0; i < 3; i++ {
if metaNodeInfo, err = masterClient.NodeAPI().GetMetaNode(fmt.Sprintf("%s:%s", m.metaNode.localAddr,
m.metaNode.listen)); err != nil {
log.LogWarnf("loadPartitions: get MetaNode info fail: err(%v)", err)
continue
}
break
}
if err != nil {
log.LogErrorf("loadPartitions: get MetaNode info fail: err(%v)", err)
return
}
if len(metaNodeInfo.PersistenceMetaPartitions) == 0 {
log.LogWarnf("loadPartitions: length of PersistenceMetaPartitions is 0, ExpiredPartition check without effect")
}
// Check metadataDir directory
fileInfo, err := os.Stat(m.rootDir)
if err != nil {
// metadataDir does not exist yet: create it, and surface the failure instead of ignoring it
if err = os.MkdirAll(m.rootDir, 0o755); err != nil {
err = errors.NewErrorf("loadPartitions: create metadataDir %s fail: %v", m.rootDir, err)
}
return
}
if !fileInfo.IsDir() {
err = errors.New("metadataDir must be directory")
return
}
// scan the data directory
fileInfoList, err := os.ReadDir(m.rootDir)
if err != nil {
return
}
syslog.Println("Start loadPartitions!!!")
var wg sync.WaitGroup
for _, fileInfo := range fileInfoList {
if fileInfo.IsDir() && strings.HasPrefix(fileInfo.Name(), partitionPrefix) {
if isExpiredPartition(fileInfo.Name(), metaNodeInfo.PersistenceMetaPartitions) {
log.LogErrorf("loadPartitions: find expired partition[%s], rename it and you can delete it manually",
fileInfo.Name())
oldName := path.Join(m.rootDir, fileInfo.Name())
newName := path.Join(m.rootDir, ExpiredPartitionPrefix+fileInfo.Name())
os.Rename(oldName, newName)
continue
}
wg.Add(1)
go func(fileName string) {
var errload error
defer func() {
if r := recover(); r != nil {
log.LogWarnf("action[loadPartitions] recovered when load partition, skip it,"+
" partition: %s, error: %s, failed: %v", fileName, errload, r)
syslog.Printf("load meta partition %v fail: %v", fileName, r)
} else if errload != nil {
log.LogWarnf("action[loadPartitions] failed to load partition, skip it, partition: %s, error: %s",
fileName, errload)
}
}()
defer wg.Done()
if len(fileName) < 10 {
log.LogWarnf("ignore unknown partition dir: %s", fileName)
return
}
var id uint64
partitionId := fileName[len(partitionPrefix):]
id, errload = strconv.ParseUint(partitionId, 10, 64)
if errload != nil {
log.LogWarnf("action[loadPartitions] ignore path: %s, not partition", partitionId)
return
}
partitionConfig := &MetaPartitionConfig{
PartitionId: id,
NodeId: m.nodeId,
RaftStore: m.raftStore,
RootDir: path.Join(m.rootDir, fileName),
ConnPool: m.connPool,
}
partitionConfig.AfterStop = func() {
m.detachPartition(id)
}
// check snapshot dir or backup
snapshotDir := path.Join(partitionConfig.RootDir, snapshotDir)
if _, errload = os.Stat(snapshotDir); errload != nil {
backupDir := path.Join(partitionConfig.RootDir, snapshotBackup)
if _, errload = os.Stat(backupDir); errload == nil {
if errload = os.Rename(backupDir, snapshotDir); errload != nil {
errload = errors.Trace(errload,
fmt.Sprintf(": fail recover backup snapshot %s",
snapshotDir))
return
}
}
errload = nil
}
partition := NewMetaPartition(partitionConfig, m)
if partition == nil {
log.LogErrorf("action[loadPartitions]: NewMetaPartition is nil")
return
}
errload = m.attachPartition(id, partition)
if errload != nil {
log.LogErrorf("action[loadPartitions] load partition id=%d failed: %s.",
id, errload.Error())
}
}(fileInfo.Name())
}
}
wg.Wait()
syslog.Println("Finish loadPartitions!!!")
return
}
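// attachPartition starts the given meta partition and, on success, registers it
// in the partition map.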
func (m *metadataManager) attachPartition(id uint64, partition MetaPartition) (err error) {
syslog.Println(fmt.Sprintf("start load metaPartition %v", id))
partition.ForceSetMetaPartitionToLoadding()
if err = partition.Start(false); err != nil {
msg := fmt.Sprintf("load meta partition %v fail: %v", id, err)
log.LogError(msg)
syslog.Println(msg)
return
}
m.mu.Lock()
defer m.mu.Unlock()
m.partitions[id] = partition
msg := fmt.Sprintf("load meta partition %v success", id)
log.LogInfof(msg)
syslog.Println(msg)
return
}
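// detachPartition removes the partition with the given ID from the partition map;
// it returns an error if the partition is unknown.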
func (m *metadataManager) detachPartition(id uint64) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
if _, has := m.partitions[id]; has {
delete(m.partitions, id)
} else {
err = fmt.Errorf("unknown partition: %d", id)
}
return
}
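// createPartition builds a new meta partition from the master's request, persists
// its metadata, starts it, and registers it; if a partition with the same ID is
// already attached, the new instance is discarded.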
func (m *metadataManager) createPartition(request *proto.CreateMetaPartitionRequest) (err error) {
partitionId := fmt.Sprintf("%d", request.PartitionID)
log.LogInfof("start create meta Partition, partition %s", partitionId)
mpc := &MetaPartitionConfig{
PartitionId: request.PartitionID,
VolName: request.VolName,
Start: request.Start,
End: request.End,
Cursor: request.Start,
UniqId: 0,
Peers: request.Members,
RaftStore: m.raftStore,
NodeId: m.nodeId,
RootDir: path.Join(m.rootDir, partitionPrefix+partitionId),
ConnPool: m.connPool,
VerSeq: request.VerSeq,
}
mpc.AfterStop = func() {
m.detachPartition(request.PartitionID)
}
partition := NewMetaPartition(mpc, m)
if partition == nil {
err = errors.NewErrorf("[createPartition] partition is nil")
return
}
if err = partition.RenameStaleMetadata(); err != nil {
err = errors.NewErrorf("[createPartition]->%s", err.Error())
}
if err = partition.PersistMetadata(); err != nil {
err = errors.NewErrorf("[createPartition]->%s", err.Error())
return
}
if err = partition.Start(true); err != nil {
os.RemoveAll(mpc.RootDir)
log.LogErrorf("load meta partition %v fail: %v", request.PartitionID, err)
err = errors.NewErrorf("[createPartition]->%s", err.Error())
return
}
m.mu.Lock()
defer m.mu.Unlock()
if oldMp, ok := m.partitions[request.PartitionID]; ok {
err = oldMp.IsEquareCreateMetaPartitionRequst(request)
partition.Stop()
partition.DeleteRaft()
os.RemoveAll(mpc.RootDir)
return
}
m.partitions[request.PartitionID] = partition
log.LogInfof("load meta partition %v success", request.PartitionID)
return
}
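// deletePartition resets the partition with the given ID and removes it from the
// partition map.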
func (m *metadataManager) deletePartition(id uint64) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
mp, has := m.partitions[id]
if !has {
return
}
mp.Reset()
delete(m.partitions, id)
return
}
// Range scans all the meta partitions.
func (m *metadataManager) Range(needLock bool, f func(i uint64, p MetaPartition) bool) {
if needLock {
m.mu.RLock()
defer m.mu.RUnlock()
}
for k, v := range m.partitions {
if !f(k, v) {
return
}
}
}
// GetPartition returns the meta partition with the given ID.
func (m *metadataManager) GetPartition(id uint64) (mp MetaPartition, err error) {
mp, err = m.getPartition(id)
return
}
// MarshalJSON only marshals the base information of every partition.
func (m *metadataManager) MarshalJSON() (data []byte, err error) {
m.mu.RLock()
defer m.mu.RUnlock()
return json.Marshal(m.partitions)
}
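// QuotaGoroutineIsOver reports whether the number of running quota goroutines has
// reached the configured maximum.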
func (m *metadataManager) QuotaGoroutineIsOver() (lsOver bool) {
log.LogInfof("QuotaGoroutineIsOver cur [%v] max [%v]", m.curQuotaGoroutineNum, m.maxQuotaGoroutineNum)
if atomic.LoadInt32(&m.curQuotaGoroutineNum) >= m.maxQuotaGoroutineNum {
return true
}
return false
}
func (m *metadataManager) QuotaGoroutineInc(num int32) {
atomic.AddInt32(&m.curQuotaGoroutineNum, num)
}
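// GetLeaderPartitions returns the meta partitions for which this node is currently
// the raft leader, keyed by partition ID.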
func (m *metadataManager) GetLeaderPartitions() map[uint64]MetaPartition {
m.mu.RLock()
defer m.mu.RUnlock()
mps := make(map[uint64]MetaPartition)
for addr, mp := range m.partitions {
if _, leader := mp.IsLeader(); leader {
mps[addr] = mp
}
}
return mps
}
// NewMetadataManager returns a new metadata manager.
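//
// A minimal wiring sketch (values are illustrative only; raftStore and metaNode
// are assumed to have been prepared by the caller):
//
//	mm := NewMetadataManager(MetadataManagerConfig{
//		NodeID:    nodeID,
//		ZoneName:  "zone-default",
//		RootDir:   "/path/to/metanode/meta",
//		RaftStore: raftStore,
//	}, metaNode)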
func NewMetadataManager(conf MetadataManagerConfig, metaNode *MetaNode) MetadataManager {
return &metadataManager{
nodeId: conf.NodeID,
zoneName: conf.ZoneName,
rootDir: conf.RootDir,
raftStore: conf.RaftStore,
partitions: make(map[uint64]MetaPartition),
metaNode: metaNode,
maxQuotaGoroutineNum: defaultMaxQuotaGoroutine,
volUpdating: new(sync.Map),
}
}
// isExpiredPartition returns whether a partition is expired.
// A partition that does not exist in the master is treated as expired.
func isExpiredPartition(fileName string, partitions []uint64) (expiredPartition bool) {
if len(partitions) == 0 {
return true
}
partitionId := fileName[len(partitionPrefix):]
id, err := strconv.ParseUint(partitionId, 10, 64)
if err != nil {
log.LogWarnf("isExpiredPartition: %s, check error [%v], skip this check", partitionId, err)
return true
}
for _, existId := range partitions {
if existId == id {
return false
}
}
return true
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/json"
"fmt"
"net"
"os"
"runtime"
"sync"
"sync/atomic"
"time"
raftProto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
MaxUsedMemFactor = 1.1
)
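// checkFollowerRead enables follower read on the partition if its volume is listed
// in volNames, and disables it otherwise.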
func (m *metadataManager) checkFollowerRead(volNames []string, partition MetaPartition) {
volName := partition.GetVolName()
for _, name := range volNames {
if name == volName {
partition.SetFollowerRead(true)
return
}
}
partition.SetFollowerRead(false)
return
}
func (m *metadataManager) checkForbiddenVolume(volNames []string, partition MetaPartition) {
volName := partition.GetVolName()
for _, name := range volNames {
if name == volName {
partition.SetForbidden(true)
return
}
}
partition.SetForbidden(false)
return
}
func (m *metadataManager) checkDisableAuditLogVolume(volNames []string, partition MetaPartition) {
volName := partition.GetVolName()
for _, name := range volNames {
if name == volName {
partition.SetEnableAuditLog(false)
return
}
}
partition.SetEnableAuditLog(true)
return
}
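// opMasterHeartbeat acks the master immediately, then asynchronously decodes the
// heartbeat task and replies with memory usage, CPU utilization, and per-partition
// reports.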
func (m *metadataManager) opMasterHeartbeat(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
// Ack the master first; the heartbeat is handled asynchronously below.
data := p.Data
m.responseAckOKToMaster(conn, p)
var (
req = &proto.HeartBeatRequest{}
resp = &proto.MetaNodeHeartbeatResponse{}
adminTask = &proto.AdminTask{
Request: req,
}
)
go func() {
start := time.Now()
decode := json.NewDecoder(bytes.NewBuffer(data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
goto end
}
m.fileStatsEnable = req.FileStatsEnable
// collect memory info
resp.Total = configTotalMem
resp.MemUsed, err = util.GetProcessMemory(os.Getpid())
if err != nil {
adminTask.Status = proto.TaskFailed
goto end
}
// report CPU utilization here
resp.CpuUtil = m.cpuUtil.Load()
m.Range(true, func(id uint64, partition MetaPartition) bool {
m.checkFollowerRead(req.FLReadVols, partition)
m.checkForbiddenVolume(req.ForbiddenVols, partition)
m.checkDisableAuditLogVolume(req.DisableAuditVols, partition)
partition.SetUidLimit(req.UidLimitInfo)
partition.SetTxInfo(req.TxInfo)
partition.setQuotaHbInfo(req.QuotaHbInfos)
mConf := partition.GetBaseConfig()
mpr := &proto.MetaPartitionReport{
PartitionID: mConf.PartitionId,
Start: mConf.Start,
End: mConf.End,
Status: proto.ReadWrite,
MaxInodeID: mConf.Cursor,
VolName: mConf.VolName,
Size: partition.DataSize(),
InodeCnt: uint64(partition.GetInodeTreeLen()),
DentryCnt: uint64(partition.GetDentryTreeLen()),
FreeListLen: uint64(partition.GetFreeListLen()),
UidInfo: partition.GetUidInfo(),
QuotaReportInfos: partition.getQuotaReportInfos(),
}
mpr.TxCnt, mpr.TxRbInoCnt, mpr.TxRbDenCnt = partition.TxGetCnt()
if mConf.Cursor >= mConf.End {
mpr.Status = proto.ReadOnly
}
if resp.MemUsed > uint64(float64(resp.Total)*MaxUsedMemFactor) {
mpr.Status = proto.ReadOnly
}
addr, isLeader := partition.IsLeader()
if addr == "" {
mpr.Status = proto.Unavailable
}
mpr.IsLeader = isLeader
resp.MetaPartitionReports = append(resp.MetaPartitionReports, mpr)
return true
})
resp.ZoneName = m.zoneName
resp.Status = proto.TaskSucceeds
end:
adminTask.Request = nil
adminTask.Response = resp
m.respondToMaster(adminTask)
if log.EnableInfo() {
log.LogInfof("%s pkt %s, resp success req:%v; respAdminTask: %v, cost %s",
remoteAddr, p.String(), req, adminTask, time.Since(start).String())
}
}()
return
}
func (m *metadataManager) opCreateMetaPartition(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
defer func() {
var buf []byte
status := proto.OpOk
if err != nil {
status = proto.OpErr
buf = []byte(err.Error())
}
p.PacketErrorWithBody(status, buf)
m.respondToClientWithVer(conn, p)
}()
req := &proto.CreateMetaPartitionRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
err = errors.NewErrorf("[opCreateMetaPartition]: Unmarshal AdminTask"+
" struct: %s", err.Error())
return
}
log.LogInfof("[%s] [remoteAddr=%s]accept a from"+
" master message: %v", p.String(), remoteAddr, adminTask)
// create a new meta partition.
if err = m.createPartition(req); err != nil {
err = errors.NewErrorf("[opCreateMetaPartition]->%s; request message: %v",
err.Error(), adminTask.Request)
return
}
log.LogInfof("%s [%s] create success req:%v; resp: %v", remoteAddr, p.String(),
req, adminTask)
return
}
// Handle OpCreate inode.
func (m *metadataManager) opCreateInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &CreateInoReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.CreateInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
// reply the operation result to the client through TCP
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opCreateInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opQuotaCreateInode(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.QuotaCreateInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.QuotaCreateInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
// reply the operation result to the client through TCP
m.respondToClient(conn, p)
log.LogDebugf("%s [opQuotaCreateInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxMetaLinkInode(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.TxLinkInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.TxCreateInodeLink(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxMetaLinkInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaLinkInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &LinkInodeReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.CreateInodeLink(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaLinkInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Free the given inodes on a raft follower.
func (m *metadataManager) opFreeInodeOnRaftFollower(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
mp, err := m.getPartition(p.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],err[%v]", p.GetOpMsgWithReqAndResult(), string(p.Data))
return
}
mp.(*metaPartition).internalDelete(p.Data[:p.Size])
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
// Handle transactional dentry creation.
func (m *metadataManager) opTxCreateDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxCreateDentryRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.TxCreateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCreateDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
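// opTxCreate handles a transaction create request on the target meta partition.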
func (m *metadataManager) opTxCreate(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxCreateRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxCreate(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCreate] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxGet(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxGetInfoRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.Pid)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxGetInfo(req, p)
m.respondToClient(conn, p)
if log.EnableDebug() {
log.LogDebugf("%s [opTxGet] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
}
return
}
func (m *metadataManager) opTxCommitRM(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxApplyRMRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxCommitRM(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCommitRM] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxRollbackRM(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxApplyRMRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxRollbackRM(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxRollbackRM] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
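// opTxCommit commits a transaction on the transaction manager's meta partition
// (looked up by TmID).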
func (m *metadataManager) opTxCommit(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.TmID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxCommit(req, p, remoteAddr)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCommit] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxRollback(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.TmID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxRollback(req, p, remoteAddr)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxRollback] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle dentry creation.
func (m *metadataManager) opCreateDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &CreateDentryReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.CreateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opCreateDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opQuotaCreateDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.QuotaCreateDentryRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.QuotaCreateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opQuotaCreateDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle transactional dentry deletion.
func (m *metadataManager) opTxDeleteDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxDeleteDentryRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxDeleteDentry(req, p, remoteAddr)
m.respondToClient(conn, p)
if log.EnableDebug() {
log.LogDebugf("%s [opTxDeleteDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
}
return
}
// Handle dentry deletion.
func (m *metadataManager) opDeleteDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &DeleteDentryReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.DeleteDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opDeleteDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle batch dentry deletion.
func (m *metadataManager) opBatchDeleteDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &BatchDeleteDentryReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.DeleteDentryBatch(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opDeleteDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxUpdateDentry(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.TxUpdateDentryRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.TxUpdateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opTxUpdateDentry] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opUpdateDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &UpdateDentryReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.UpdateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opUpdateDentry] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxMetaUnlinkInode(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.TxUnlinkInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.TxUnlinkInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaUnlinkInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &UnlinkInoReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.UnlinkInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchUnlinkInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &BatchUnlinkInoReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.UnlinkInodeBatch(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opReadDirOnly(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.ReadDirOnlyRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ReadDirOnly(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [%v]req: %v , resp: %v, body: %s", remoteAddr,
p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle OpReadDir
func (m *metadataManager) opReadDir(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.ReadDirRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ReadDir(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [%v]req: %v , resp: %v, body: %s", remoteAddr,
p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle OpReadDirLimit
func (m *metadataManager) opReadDirLimit(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.ReadDirLimitRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ReadDirLimit(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [%v]req: %v , resp: %v, body: %s", remoteAddr,
p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
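// opMetaInodeGet serves an inode get request; follower read is allowed, and the
// response is flagged when this metanode holds a newer volume version than the
// client's.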
func (m *metadataManager) opMetaInodeGet(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &InodeGetReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("Unmarshal [%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
log.LogDebugf("action[opMetaInodeGet] request %v", req)
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("getPartition [%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
if err = mp.InodeGet(req, p); err != nil {
err = errors.NewErrorf("InodeGet [%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
}
if err = m.respondToClient(conn, p); err != nil {
log.LogDebugf("%s [opMetaInodeGet] err [%v] req: %d - %v; resp: %v, body: %s",
remoteAddr, err, p.GetReqID(), req, p.GetResultMsg(), p.Data)
}
log.LogDebugf("%s [opMetaInodeGet] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
if value, ok := m.volUpdating.Load(req.VolName); ok {
ver2Phase := value.(*verOp2Phase)
if ver2Phase.verSeq > req.VerSeq {
// Reuse ExtentType to flag that the version on this metanode differs from the client's.
// The response makes the client refresh every streamer's extents and its verSeq.
p.ExtentType |= proto.MultiVersionFlag
p.VerSeq = ver2Phase.verSeq
}
}
return
}
func (m *metadataManager) opBatchMetaEvictInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.BatchEvictInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] request unmarshal: %v", p.GetOpMsgWithReqAndResult(), err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if err = mp.EvictInodeBatch(req, p, remoteAddr); err != nil {
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
}
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opBatchMetaEvictInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaEvictInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.EvictInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if err = mp.EvictInode(req, p, remoteAddr); err != nil {
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
}
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaEvictInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opSetAttr(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &SetattrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if err = mp.SetAttr(req, p.Data, p); err != nil {
err = errors.NewErrorf("[opSetAttr] req: %v, error: %s", req, err.Error())
}
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opSetAttr] req: %d - %v, resp: %v, body: %s", remoteAddr,
p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Lookup request
func (m *metadataManager) opMetaLookup(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.LookupRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.Lookup(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaLookup] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaExtentsAdd(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.AppendExtentKeyRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.ExtentAppend(req, p)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
if err != nil {
log.LogErrorf("%s [opMetaExtentsAdd] ExtentAppend: %s, "+
"response to client: %s", remoteAddr, err.Error(), p.GetResultMsg())
}
log.LogDebugf("%s [opMetaExtentsAdd] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Append one extent with discard check
func (m *metadataManager) opMetaExtentAddWithCheck(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.AppendExtentKeyWithCheckRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if err = mp.ExtentAppendWithCheck(req, p); err != nil {
log.LogErrorf("%s [opMetaExtentAddWithCheck] ExtentAppendWithCheck: %s", remoteAddr, err.Error())
}
m.updatePackRspSeq(mp, p)
if err = m.respondToClientWithVer(conn, p); err != nil {
log.LogErrorf("%s [opMetaExtentAddWithCheck] ExtentAppendWithCheck: %s, "+
"response to client: %s", remoteAddr, err.Error(), p.GetResultMsg())
}
log.LogDebugf("%s [opMetaExtentAddWithCheck] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaExtentsList(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.GetExtentsRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ExtentsList(req, p)
m.respondToClient(conn, p)
if log.EnableDebug() {
log.LogDebugf("%s [opMetaExtentsList] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), log.TruncMsg(string(p.Data)))
}
return
}
func (m *metadataManager) opMetaObjExtentsList(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.GetExtentsRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ObjExtentsList(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaObjExtentsList] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaExtentsDel(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
panic("not implemented yet")
// req := &proto.DelExtentKeyRequest{}
// if err = json.Unmarshal(p.Data, req); err != nil {
// p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
// m.respondToClientWithVer(conn, p)
// err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
// return
// }
// mp, err := m.getPartition(req.PartitionID)
// if err != nil {
// p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
// m.respondToClientWithVer(conn, p)
// err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
// return
// }
// if !m.serveProxy(conn, mp, p) {
// return
// }
// mp.ExtentsDelete(req, p)
// m.respondToClientWithVer(conn, p)
// log.LogDebugf("%s [OpMetaTruncate] req: %d - %v, resp body: %v, "+
// "resp body: %s", remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
// return
}
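// opMetaExtentsTruncate handles an extents truncate request on the owning meta partition.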
func (m *metadataManager) opMetaExtentsTruncate(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &ExtentsTruncateReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
mp.ExtentsTruncate(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [OpMetaTruncate] req: %d - %v, resp body: %v, "+
"resp body: %s", remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaClearInodeCache(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.ClearInodeCacheRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.ClearInodeCache(req, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaClearInodeCache] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Delete a meta partition.
func (m *metadataManager) opDeleteMetaPartition(conn net.Conn,
p *Packet, remoteAddr string) (err error) {
req := &proto.DeleteMetaPartitionRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
// Ack the master request
conf := mp.GetBaseConfig()
mp.Stop()
mp.DeleteRaft()
m.deletePartition(mp.GetBaseConfig().PartitionId)
os.RemoveAll(conf.RootDir)
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
runtime.GC()
log.LogInfof("%s [opDeleteMetaPartition] req: %d - %v, resp: %v",
remoteAddr, p.GetReqID(), req, err)
return
}
func (m *metadataManager) opUpdateMetaPartition(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := new(UpdatePartitionReq)
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
m.responseAckOKToMaster(conn, p)
resp := &UpdatePartitionResp{
VolName: req.VolName,
PartitionID: req.PartitionID,
End: req.End,
}
err = mp.UpdatePartition(req, resp)
adminTask.Response = resp
adminTask.Request = nil
m.respondToMaster(adminTask)
log.LogInfof("%s [opUpdateMetaPartition] req[%v], response[%v].",
remoteAddr, req, adminTask)
return
}
func (m *metadataManager) opLoadMetaPartition(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.MetaPartitionLoadRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if err = mp.ResponseLoadMetaPartition(p); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
log.LogErrorf("%s [opLoadMetaPartition] req[%v], "+
"response marshal[%v]", remoteAddr, req, err.Error())
m.respondToClient(conn, p)
return
}
m.respondToClient(conn, p)
log.LogInfof("%s [opLoadMetaPartition] req[%v], response status[%s], "+
"response body[%s], error[%v]", remoteAddr, req, p.GetResultMsg(), p.Data,
err)
return
}
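// opDecommissionMetaPartition replaces a raft member: it adds req.AddPeer and then
// removes req.RemovePeer from the partition's raft group.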
func (m *metadataManager) opDecommissionMetaPartition(conn net.Conn,
p *Packet, remoteAddr string) (err error) {
var reqData []byte
req := &proto.MetaPartitionDecommissionRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return err
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return err
}
if !m.serveProxy(conn, mp, p) {
return nil
}
if req.AddPeer.ID == req.RemovePeer.ID {
err = errors.NewErrorf("[opDecommissionMetaPartition]: AddPeer[%v] same withRemovePeer[%v]", req.AddPeer, req.RemovePeer)
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
reqData, err = json.Marshal(req)
if err != nil {
err = errors.NewErrorf("[opDecommissionMetaPartition]: partitionID= %d, "+
"Marshal %s", req.PartitionID, err)
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
_, err = mp.ChangeMember(raftProto.ConfAddNode,
raftProto.Peer{ID: req.AddPeer.ID}, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return err
}
_, err = mp.ChangeMember(raftProto.ConfRemoveNode,
raftProto.Peer{ID: req.RemovePeer.ID}, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return err
}
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
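// opAddMetaPartitionRaftMember adds req.AddPeer to the partition's raft group;
// if the peer already exists, the request succeeds without changes.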
func (m *metadataManager) opAddMetaPartitionRaftMember(conn net.Conn,
p *Packet, remoteAddr string) (err error) {
var reqData []byte
req := &proto.AddMetaPartitionRaftMemberRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
defer func() {
if err != nil {
log.LogInfof("pkt %s remote %s reqId add raft member failed, req %v, err %s", p.String(), remoteAddr, adminTask, err.Error())
return
}
log.LogInfof("pkt %s, remote %s add raft member success, req %v", p.String(), remoteAddr, adminTask)
}()
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return err
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpTryOtherAddr, ([]byte)(proto.ErrMetaPartitionNotExists.Error()))
m.respondToClientWithVer(conn, p)
return err
}
if mp.IsExsitPeer(req.AddPeer) {
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
log.LogInfof("[%s], remote %s start add raft member, req %v", p.String(), remoteAddr, adminTask)
if !m.serveProxy(conn, mp, p) {
return nil
}
reqData, err = json.Marshal(req)
if err != nil {
err = errors.NewErrorf("[opAddMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, err)
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
if req.AddPeer.ID == 0 {
err = errors.NewErrorf("[opAddMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, fmt.Sprintf("unavali AddPeerID %v", req.AddPeer.ID))
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
_, err = mp.ChangeMember(raftProto.ConfAddNode,
raftProto.Peer{ID: req.AddPeer.ID}, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return err
}
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
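// opRemoveMetaPartitionRaftMember removes req.RemovePeer from the partition's raft
// group after verifying that the member can be removed.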
func (m *metadataManager) opRemoveMetaPartitionRaftMember(conn net.Conn,
p *Packet, remoteAddr string) (err error) {
var reqData []byte
req := &proto.RemoveMetaPartitionRaftMemberRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
defer func() {
if err != nil {
log.LogInfof("[%s], remote %s remove raft member failed, req %v, err %s", p.String(), remoteAddr, adminTask, err.Error())
return
}
log.LogInfof("[%s], remote %s remove raft member success, req %v", p.String(), remoteAddr, adminTask)
}()
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return err
}
log.LogInfof("[%s], remote %s remove raft member success, req %v", p.String(), remoteAddr, adminTask)
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return err
}
if !mp.IsExsitPeer(req.RemovePeer) {
p.PacketOkReply()
m.respondToClient(conn, p)
return
}
if !m.serveProxy(conn, mp, p) {
return nil
}
reqData, err = json.Marshal(req)
if err != nil {
err = errors.NewErrorf("[opRemoveMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, err)
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
if err = mp.CanRemoveRaftMember(req.RemovePeer); err != nil {
err = errors.NewErrorf("[opRemoveMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, fmt.Sprintf("unavali RemovePeerID %v", req.RemovePeer.ID))
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
if req.RemovePeer.ID == 0 {
err = errors.NewErrorf("[opRemoveMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, fmt.Sprintf("unavali RemovePeerID %v", req.RemovePeer.ID))
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
_, err = mp.ChangeMember(raftProto.ConfRemoveNode,
raftProto.Peer{ID: req.RemovePeer.ID}, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return err
}
p.PacketOkReply()
m.respondToClient(conn, p)
return
}
func (m *metadataManager) opMetaBatchInodeGet(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.BatchInodeGetRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
log.LogDebugf("action[opMetaBatchInodeGet] req %v", req)
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.InodeGetBatch(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaBatchInodeGet] req: %d - %v, resp: %v, "+
"body: %s", remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaPartitionTryToLeader(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
mp, err := m.getPartition(p.PartitionID)
if err != nil {
goto errDeal
}
if err = mp.TryToLeader(p.PartitionID); err != nil {
goto errDeal
}
p.PacketOkReply()
m.respondToClient(conn, p)
return
errDeal:
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
func (m *metadataManager) opMetaDeleteInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.DeleteInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.DeleteInode(req, p, remoteAddr)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchDeleteInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
var req *proto.DeleteInodeBatchRequest
if err = json.Unmarshal(p.Data, &req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.DeleteInodeBatch(req, p, remoteAddr)
log.LogDebugf("%s [opMetaDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
_ = m.respondToClientWithVer(conn, p)
return
}
func (m *metadataManager) opMetaUpdateXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.UpdateXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.UpdateXAttr(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaSetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaSetXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.SetXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.SetXAttr(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaSetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchSetXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.BatchSetXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.BatchSetXAttr(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClient(conn, p)
log.LogDebugf("%s [OpMetaBatchSetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaGetXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.GetXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetXAttr(req, p)
_ = m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaGetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaGetAllXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.GetAllXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetAllXAttr(req, p)
_ = m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaGetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchGetXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.BatchGetXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.BatchGetXAttr(req, p)
_ = m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaBatchGetXAttr req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaRemoveXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.RemoveXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.RemoveXAttr(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaGetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaListXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.ListXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ListXAttr(req, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaGetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchExtentsAdd(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.AppendExtentKeysRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.BatchExtentAppend(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaBatchExtentsAdd] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchObjExtentsAdd(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.AppendObjExtentKeysRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.BatchObjExtentAppend(req, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaBatchObjExtentsAdd] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opCreateMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.CreateMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.CreateMultipart(req, p)
_ = m.respondToClientWithVer(conn, p)
return
}
func (m *metadataManager) opRemoveMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.RemoveMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.RemoveMultipart(req, p)
_ = m.respondToClient(conn, p)
return
}
func (m *metadataManager) opGetExpiredMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.GetExpiredMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetExpiredMultipart] req: %v, resp: %v", req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetExpiredMultipart(req, p)
_ = m.respondToClient(conn, p)
return
}
func (m *metadataManager) opGetMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.GetMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetMultipart(req, p)
_ = m.respondToClient(conn, p)
return
}
func (m *metadataManager) opAppendMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.AddMultipartPartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.AppendMultipart(req, p)
_ = m.respondToClientWithVer(conn, p)
return
}
func (m *metadataManager) opListMultipart(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.ListMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opListMultipart] req: %v, resp: %v", req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opListMultipart] req: %v, resp: %v", req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ListMultipart(req, p)
_ = m.respondToClient(conn, p)
return
}
// Handle OpMetaTxCreateInode inode.
func (m *metadataManager) opTxCreateInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxCreateInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxCreateInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCreateInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchSetInodeQuota(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.BatchSetMetaserverQuotaReuqest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opMetaBatchSetInodeQuota] req: %v, resp: %v", req, err.Error())
return
}
log.LogInfof("[opMetaBatchSetInodeQuota] req [%v] decode req.", req)
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opMetaBatchSetInodeQuota] req: %v, resp: %v", req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
resp := &proto.BatchSetMetaserverQuotaResponse{}
err = mp.batchSetInodeQuota(req, resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClient(conn, p)
return
}
p.PacketOkWithBody(reply)
_ = m.respondToClient(conn, p)
log.LogInfof("[opMetaBatchSetInodeQuota] req [%v] resp [%v] success.", req, resp)
return
}
func (m *metadataManager) opMetaBatchDeleteInodeQuota(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.BatchDeleteMetaserverQuotaReuqest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opMetaBatchDeleteInodeQuota] req: %v, resp: %v", req, err.Error())
return
}
log.LogInfof("[opMetaBatchDeleteInodeQuota] req [%v] decode req.", req)
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opMetaBatchDeleteInodeQuota] req: %v, resp: %v", req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
resp := &proto.BatchDeleteMetaserverQuotaResponse{}
err = mp.batchDeleteInodeQuota(req, resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClient(conn, p)
return
}
p.PacketOkWithBody(reply)
_ = m.respondToClient(conn, p)
log.LogInfof("[opMetaBatchDeleteInodeQuota] req [%v] resp [%v] success.", req, resp)
return err
}
func (m *metadataManager) opMetaGetInodeQuota(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.GetInodeQuotaRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.getInodeQuota(req.Inode, p)
_ = m.respondToClient(conn, p)
log.LogInfof("[opMetaGetInodeQuota] get inode[%v] quota success.", req.Inode)
return
}
func (m *metadataManager) opMetaGetUniqID(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.GetUniqIDRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetUniqID(p, req.Num)
m.respondToClient(conn, p)
if err != nil {
log.LogErrorf("%s [opMetaGetUniqID] %s, "+
"response to client: %s", remoteAddr, err.Error(), p.GetResultMsg())
}
log.LogDebugf("%s [opMetaGetUniqID] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) prepareCreateVersion(req *proto.MultiVersionOpRequest) (err error, opAgain bool) {
var ver2Phase *verOp2Phase
if value, ok := m.volUpdating.Load(req.VolumeID); ok {
ver2Phase = value.(*verOp2Phase)
if req.VerSeq < ver2Phase.verSeq {
err = fmt.Errorf("seq [%v] create less than loal %v", req.VerSeq, ver2Phase.verSeq)
return
} else if req.VerSeq == ver2Phase.verPrepare {
if ver2Phase.status == proto.VersionWorking {
opAgain = true
return
}
}
}
ver2Phase = &verOp2Phase{}
ver2Phase.step = uint32(req.Op)
ver2Phase.status = proto.VersionWorking
ver2Phase.verPrepare = req.VerSeq
m.volUpdating.Store(req.VolumeID, ver2Phase)
log.LogWarnf("action[prepareCreateVersion] volume %v update to ver [%v] step %v",
req.VolumeID, req.VerSeq, ver2Phase.step)
return
}
func (m *metadataManager) checkVolVerList() (err error) {
volumeArr := make(map[string]bool)
log.LogDebugf("checkVolVerList start")
m.Range(true, func(id uint64, partition MetaPartition) bool {
volumeArr[partition.GetVolName()] = true
return true
})
for volName := range volumeArr {
mpsVerlist := make(map[uint64]*proto.VolVersionInfoList)
// fetch the verlist first, otherwise the mp verlist may change while the followers are being processed
m.Range(true, func(id uint64, partition MetaPartition) bool {
if partition.GetVolName() != volName {
return true
}
log.LogDebugf("action[checkVolVerList] volumeName %v id[%v] dp verlist %v partition.GetBaseConfig().PartitionId %v",
volName, id, partition.GetVerList(), partition.GetBaseConfig().PartitionId)
mpsVerlist[id] = &proto.VolVersionInfoList{VerList: partition.GetVerList()}
return true
})
var info *proto.VolVersionInfoList
if info, err = masterClient.AdminAPI().GetVerList(volName); err != nil {
log.LogErrorf("action[checkVolVerList] volumeName %v err %v", volName, err)
return
}
log.LogDebugf("action[checkVolVerList] volumeName %v info %v", volName, info)
m.Range(true, func(id uint64, partition MetaPartition) bool {
if partition.GetVolName() != volName {
return true
}
log.LogDebugf("action[checkVolVerList] volumeName %v info %v id[%v] ", volName, info, id)
if _, exist := mpsVerlist[id]; exist {
if err = partition.checkByMasterVerlist(mpsVerlist[id], info); err != nil {
return true
}
}
if _, err = partition.checkVerList(info, false); err != nil {
log.LogErrorf("[checkVolVerList] volumeName %v err %v", volName, err)
}
return true
})
}
return
}
func (m *metadataManager) commitCreateVersion(VolumeID string, VerSeq uint64, Op uint8, synchronize bool) (err error) {
log.LogWarnf("action[commitCreateVersion] volume %v seq [%v]", VolumeID, VerSeq)
var wg sync.WaitGroup
// wg.Add(len(m.partitions))
resultCh := make(chan error, len(m.partitions))
m.Range(true, func(id uint64, partition MetaPartition) bool {
if partition.GetVolName() != VolumeID {
return true
}
if _, ok := partition.IsLeader(); !ok {
return true
}
wg.Add(1)
go func(mpId uint64, mp MetaPartition) {
defer wg.Done()
log.LogInfof("action[commitCreateVersion] volume %v mp %v do HandleVersionOp verseq [%v]", VolumeID, mpId, VerSeq)
if err := mp.HandleVersionOp(Op, VerSeq, nil, synchronize); err != nil {
log.LogErrorf("action[commitCreateVersion] volume %v mp %v do HandleVersionOp verseq [%v] err %v", VolumeID, mpId, VerSeq, err)
resultCh <- err
return
}
}(id, partition)
return true
})
wg.Wait()
select {
case err = <-resultCh:
if err != nil {
close(resultCh)
return
}
default:
log.LogInfof("action[commitCreateVersion] volume %v do HandleVersionOp verseq [%v] finished", VolumeID, VerSeq)
}
close(resultCh)
if Op == proto.DeleteVersion {
return
}
if Op == proto.CreateVersionPrepare {
return
}
if value, ok := m.volUpdating.Load(VolumeID); ok {
ver2Phase := value.(*verOp2Phase)
log.LogWarnf("action[commitCreateVersion] try commit volume %v prepare seq [%v] with commit seq [%v]",
VolumeID, ver2Phase.verPrepare, VerSeq)
if VerSeq < ver2Phase.verSeq {
err = fmt.Errorf("volname [%v] seq [%v] create less than loal %v", VolumeID, VerSeq, ver2Phase.verSeq)
log.LogErrorf("action[commitCreateVersion] err %v", err)
return
}
if ver2Phase.step != proto.CreateVersionPrepare {
err = fmt.Errorf("volname [%v] step not prepare", VolumeID)
log.LogErrorf("action[commitCreateVersion] err %v", err)
return
}
ver2Phase.verSeq = VerSeq
ver2Phase.step = proto.CreateVersionCommit
ver2Phase.status = proto.VersionWorkingFinished
log.LogWarnf("action[commitCreateVersion] commit volume %v prepare seq [%v] with commit seq [%v]",
VolumeID, ver2Phase.verPrepare, VerSeq)
return
}
err = fmt.Errorf("volname [%v] not found", VolumeID)
log.LogErrorf("action[commitCreateVersion] err %v", err)
return
}
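// exampleTwoPhaseVersionOp is a minimal sketch (illustrative only, not part of the
// original code) of the two-phase flow implemented above: prepareCreateVersion records
// the prepared sequence for the volume, then commitCreateVersion fans the version op out
// to every local leader partition of that volume and promotes the prepared state.
func (m *metadataManager) exampleTwoPhaseVersionOp(req *proto.MultiVersionOpRequest) (err error) {
var opAgain bool
// phase 1: remember the prepared verSeq; opAgain means the same prepare is already applied or in flight
if err, opAgain = m.prepareCreateVersion(req); err != nil || opAgain {
return
}
// phase 2: apply the version op on the local leader partitions synchronously
return m.commitCreateVersion(req.VolumeID, req.VerSeq, req.Op, true)
}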
func (m *metadataManager) updatePackRspSeq(mp MetaPartition, p *Packet) {
if mp.GetVerSeq() > p.VerSeq {
log.LogDebugf("action[checkmultiSnap.multiVersionstatus] mp ver [%v], packet ver [%v]", mp.GetVerSeq(), p.VerSeq)
p.VerSeq = mp.GetVerSeq() // used to response to client and try update verSeq of client
p.ExtentType |= proto.VersionListFlag
p.VerList = make([]*proto.VolVersionInfo, len(mp.GetVerList()))
copy(p.VerList, mp.GetVerList())
}
return
}
func (m *metadataManager) checkMultiVersionStatus(mp MetaPartition, p *Packet) (err error) {
if (p.ExtentType&proto.MultiVersionFlag == 0) && mp.GetVerSeq() > 0 {
log.LogWarnf("action[checkmultiSnap.multiVersionstatus] volname [%v] mp ver [%v], client use old ver before snapshot", mp.GetVolName(), mp.GetVerSeq())
return fmt.Errorf("client use old ver before snapshot")
}
// The meta node does not need to check verSeq as strictly as the data node: a file append or modAppendWrite on the meta node is invisible to other files.
// It only needs to guarantee that the verSeq written on meta nodes grows monotonically from the client's point of view.
log.LogDebugf("action[checkmultiSnap.multiVersionstatus] mp[%v] ver [%v], packet ver [%v] reqId %v", mp.GetBaseConfig().PartitionId, mp.GetVerSeq(), p.VerSeq, p.ReqID)
if mp.GetVerSeq() >= p.VerSeq {
if mp.GetVerSeq() > p.VerSeq {
log.LogDebugf("action[checkmultiSnap.multiVersionstatus] mp ver [%v], packet ver [%v]", mp.GetVerSeq(), p.VerSeq)
p.VerSeq = mp.GetVerSeq() // used to response to client and try update verSeq of client
p.ExtentType |= proto.VersionListFlag
p.VerList = make([]*proto.VolVersionInfo, len(mp.GetVerList()))
copy(p.VerList, mp.GetVerList())
}
return
}
if p.IsVersionList() {
_, err = mp.checkVerList(&proto.VolVersionInfoList{VerList: p.VerList}, true)
return
}
p.ResultCode = proto.OpAgainVerionList
// need return and tell client
err = fmt.Errorf("volname [%v] req seq [%v] but not found commit status", mp.GetVolName(), p.VerSeq)
if value, ok := m.volUpdating.Load(mp.GetVolName()); ok {
ver2Phase := value.(*verOp2Phase)
if ver2Phase.isActiveReqToMaster {
return
}
}
select {
case m.verUpdateChan <- mp.GetVolName():
default:
log.LogWarnf("channel is full, volname [%v] not be queued", mp.GetVolName())
}
return
}
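// exampleVersionedWrite is a minimal sketch (illustrative only, not part of the original
// code) of how the write handlers in this file combine the two helpers above:
// checkMultiVersionStatus rejects a request whose version sequence is behind the
// partition, and updatePackRspSeq piggybacks the newer sequence and version list on the
// reply so the client can catch up.
func (m *metadataManager) exampleVersionedWrite(conn net.Conn, p *Packet, mp MetaPartition) (err error) {
if err = m.checkMultiVersionStatus(mp, p); err != nil {
_ = m.respondToClientWithVer(conn, p)
return
}
// ... apply the mutation on mp here ...
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
return
}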
func (m *metadataManager) checkAndPromoteVersion(volName string) (err error) {
log.LogInfof("action[checkmultiSnap.multiVersionstatus] volumeName %v", volName)
var info *proto.VolumeVerInfo
if value, ok := m.volUpdating.Load(volName); ok {
ver2Phase := value.(*verOp2Phase)
if atomic.LoadUint32(&ver2Phase.status) != proto.VersionWorkingAbnormal &&
atomic.LoadUint32(&ver2Phase.step) == proto.CreateVersionPrepare {
ver2Phase.Lock() // TryLock may be a better fit here once go1.18+ is adopted
defer ver2Phase.Unlock()
// check again in case another goroutine changed the state while we were blocked on the lock
if atomic.LoadUint32(&ver2Phase.status) == proto.VersionWorkingAbnormal ||
atomic.LoadUint32(&ver2Phase.step) != proto.CreateVersionPrepare {
log.LogWarnf("action[checkmultiSnap.multiVersionstatus] volumeName %v status [%v] step %v",
volName, atomic.LoadUint32(&ver2Phase.status), atomic.LoadUint32(&ver2Phase.step))
return
}
if info, err = masterClient.AdminAPI().GetVerInfo(volName); err != nil {
log.LogErrorf("action[checkmultiSnap.multiVersionstatus] volumeName %v status [%v] step %v err %v",
volName, atomic.LoadUint32(&ver2Phase.status), atomic.LoadUint32(&ver2Phase.step), err)
return
}
if info.VerSeqPrepare != ver2Phase.verPrepare {
atomic.StoreUint32(&ver2Phase.status, proto.VersionWorkingAbnormal)
err = fmt.Errorf("volumeName %v status [%v] step %v",
volName, atomic.LoadUint32(&ver2Phase.status), atomic.LoadUint32(&ver2Phase.step))
log.LogErrorf("action[checkmultiSnap.multiVersionstatus] err %v", err)
return
}
if info.VerPrepareStatus == proto.CreateVersionCommit {
if err = m.commitCreateVersion(volName, info.VerSeqPrepare, proto.CreateVersionCommit, false); err != nil {
log.LogErrorf("action[checkmultiSnap.multiVersionstatus] err %v", err)
return
}
}
}
} else {
log.LogErrorf("action[checkmultiSnap.multiVersionstatus] volumeName %v not found", volName)
}
return
}
func (m *metadataManager) opMultiVersionOp(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
// For ack to master
data := p.Data
m.responseAckOKToMaster(conn, p)
var (
req = &proto.MultiVersionOpRequest{}
resp = &proto.MultiVersionOpResponse{}
adminTask = &proto.AdminTask{
Request: req,
}
opAgain bool
)
log.LogDebugf("action[opMultiVersionOp] volume %v op [%v]", req.VolumeID, req.Op)
start := time.Now()
decode := json.NewDecoder(bytes.NewBuffer(data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
log.LogErrorf("action[opMultiVersionOp] %v mp err %v do Decoder", req.VolumeID, err.Error())
goto end
}
resp.Status = proto.TaskSucceeds
resp.VolumeID = req.VolumeID
resp.Addr = req.Addr
resp.VerSeq = req.VerSeq
resp.Op = req.Op
if req.Op == proto.CreateVersionPrepare {
if err, opAgain = m.prepareCreateVersion(req); err != nil || opAgain {
log.LogErrorf("action[opMultiVersionOp] %v mp err %v do Decoder", req.VolumeID, err)
goto end
}
if err = m.commitCreateVersion(req.VolumeID, req.VerSeq, req.Op, true); err != nil {
log.LogErrorf("action[opMultiVersionOp] %v mp err %v do commitCreateVersion", req.VolumeID, err.Error())
goto end
}
} else if req.Op == proto.CreateVersionCommit || req.Op == proto.DeleteVersion {
if err = m.commitCreateVersion(req.VolumeID, req.VerSeq, req.Op, false); err != nil {
log.LogErrorf("action[opMultiVersionOp] %v mp err %v do commitCreateVersion", req.VolumeID, err.Error())
goto end
}
}
end:
if err != nil {
resp.Result = err.Error()
}
adminTask.Request = nil
adminTask.Response = resp
if errRsp := m.respondToMaster(adminTask); errRsp != nil {
log.LogInfof("action[opMultiVersionOp] %s pkt %s, resp success req:%v; respAdminTask: %v, resp: %v, errRsp %v err %v",
remoteAddr, p.String(), req, adminTask, resp, errRsp, err)
}
if log.EnableInfo() {
rspData, _ := json.Marshal(resp)
log.LogInfof("action[opMultiVersionOp] %s pkt %s, resp success req:%v; respAdminTask: %v, resp: %v, cost %s",
remoteAddr, p.String(), req, adminTask, string(rspData), time.Since(start).String())
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"net"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
ForceClosedConnect = true
NoClosedConnect = false
)
var ErrForbiddenMetaPartition = errors.New("meta partition is forbidden")
func (m *metadataManager) IsForbiddenOp(mp MetaPartition, reqOp uint8) bool {
if !mp.IsForbidden() {
return false
}
switch reqOp {
case
// dentry
proto.OpMetaCreateDentry,
proto.OpMetaTxCreateDentry,
proto.OpQuotaCreateDentry,
proto.OpMetaDeleteDentry,
proto.OpMetaTxDeleteDentry,
proto.OpMetaBatchDeleteDentry,
proto.OpMetaUpdateDentry,
proto.OpMetaTxUpdateDentry,
// extend
proto.OpMetaUpdateXAttr,
proto.OpMetaSetXAttr,
proto.OpMetaBatchSetXAttr,
proto.OpMetaRemoveXAttr,
// extent
proto.OpMetaTruncate,
proto.OpMetaExtentsAdd,
proto.OpMetaExtentAddWithCheck,
proto.OpMetaObjExtentAdd,
proto.OpMetaBatchObjExtentsAdd,
proto.OpMetaBatchExtentsAdd,
proto.OpMetaExtentsDel,
// inode
proto.OpMetaCreateInode,
proto.OpQuotaCreateInode,
proto.OpMetaTxUnlinkInode,
proto.OpMetaUnlinkInode,
proto.OpMetaBatchUnlinkInode,
proto.OpMetaTxLinkInode,
proto.OpMetaLinkInode,
proto.OpMetaEvictInode,
proto.OpMetaBatchEvictInode,
proto.OpMetaSetattr,
proto.OpMetaBatchDeleteInode,
proto.OpMetaClearInodeCache,
proto.OpMetaTxCreateInode,
// multipart
proto.OpAddMultipartPart,
proto.OpRemoveMultipart,
proto.OpCreateMultipart,
// quota
proto.OpMetaBatchSetInodeQuota,
proto.OpMetaBatchDeleteInodeQuota:
return true
default:
return false
}
}
// The proxy is used during the leader change. When a leader of a partition changes, the proxy forwards the request to
// the new leader.
func (m *metadataManager) serveProxy(conn net.Conn, mp MetaPartition,
p *Packet) (ok bool) {
var (
mConn *net.TCPConn
leaderAddr string
err error
reqID = p.ReqID
reqOp = p.Opcode
)
// check forbidden
if m.IsForbiddenOp(mp, reqOp) {
err = ErrForbiddenMetaPartition
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClient(conn, p)
return false
}
if leaderAddr, ok = mp.IsLeader(); ok {
return
}
if leaderAddr == "" {
err = ErrNoLeader
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
goto end
}
mConn, err = m.connPool.GetConnect(leaderAddr)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.connPool.PutConnect(mConn, ForceClosedConnect)
goto end
}
// forward the packet to the leader over the pooled connection
if err = p.WriteToConn(mConn); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.connPool.PutConnect(mConn, ForceClosedConnect)
goto end
}
// read the leader's response from the pooled connection
if err = p.ReadFromConnWithVer(mConn, proto.NoReadDeadlineTime); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.connPool.PutConnect(mConn, ForceClosedConnect)
goto end
}
if reqID != p.ReqID || reqOp != p.Opcode {
log.LogErrorf("serveProxy: send and received packet mismatch: req(%v_%v) resp(%v_%v)",
reqID, reqOp, p.ReqID, p.Opcode)
}
m.connPool.PutConnect(mConn, NoClosedConnect)
end:
m.respondToClient(conn, p)
if err != nil {
log.LogErrorf("[serveProxy]: req: %d - %v, %v, packet(%v)", p.GetReqID(),
p.GetOpMsg(), err, p)
}
log.LogDebugf("[serveProxy] req: %d - %v, resp: %v, packet(%v)", p.GetReqID(), p.GetOpMsg(),
p.GetResultMsg(), p)
return
}
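// exampleProxyHandler is a minimal sketch (illustrative only, not part of the original
// code) of the handler pattern that serveProxy supports: look up the partition, let
// serveProxy forward the packet to the current leader when this node is not the leader
// (in which case it has already replied to the client), and run the operation locally
// only when serveProxy returns true.
func (m *metadataManager) exampleProxyHandler(conn net.Conn, p *Packet, partitionID uint64) (err error) {
mp, err := m.getPartition(partitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClient(conn, p)
return
}
if !m.serveProxy(conn, mp, p) {
// forwarded to the leader (or rejected); serveProxy already responded
return
}
// leader path: execute the real operation against mp here, then reply
p.PacketOkReply()
m.respondToClient(conn, p)
return
}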
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"net"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// Reply operation results to the master.
func (m *metadataManager) respondToMaster(task *proto.AdminTask) (err error) {
// handle panic
defer func() {
if r := recover(); r != nil {
switch data := r.(type) {
case error:
err = data
default:
err = errors.New(data.(string))
}
}
}()
if err = masterClient.NodeAPI().ResponseMetaNodeTask(task); err != nil {
err = errors.Trace(err, "try respondToMaster failed")
}
return
}
// Reply data through tcp connection to the client.
func (m *metadataManager) respondToClientWithVer(conn net.Conn, p *Packet) (err error) {
// Handle panic
defer func() {
if r := recover(); r != nil {
switch data := r.(type) {
case error:
err = data
default:
err = errors.New(data.(string))
}
}
}()
// process data and send the reply through the specified tcp connection.
if p.VerSeq > 0 {
p.ExtentType |= proto.MultiVersionFlag
}
err = p.WriteToConn(conn)
if err != nil {
log.LogErrorf("response to client[%s], "+
"request[%s], response packet[%s]",
err.Error(), p.GetOpMsg(), p.GetResultMsg())
}
return
}
// Reply data through tcp connection to the client.
func (m *metadataManager) respondToClient(conn net.Conn, p *Packet) (err error) {
// Handle panic
defer func() {
if r := recover(); r != nil {
switch data := r.(type) {
case error:
err = data
default:
err = errors.New(data.(string))
}
}
}()
// process data and send the reply through the specified tcp connection.
err = p.WriteToConn(conn)
if err != nil {
log.LogErrorf("response to client[%s], "+
"request[%s], response packet[%s]",
err.Error(), p.GetOpMsg(), p.GetResultMsg())
}
return
}
func (m *metadataManager) responseAckOKToMaster(conn net.Conn, p *Packet) {
go func() {
p.PacketOkReply()
if err := p.WriteToConn(conn); err != nil {
log.LogErrorf("ack master response: %s", err.Error())
}
}()
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type MetaQuotaManager struct {
statisticTemp *sync.Map // key quotaId, value proto.QuotaUsedInfo
statisticBase *sync.Map // key quotaId, value proto.QuotaUsedInfo
statisticRebuildTemp *sync.Map // key quotaId, value proto.QuotaUsedInfo
statisticRebuildBase *sync.Map // key quotaId, value proto.QuotaUsedInfo
limitedMap *sync.Map
rbuilding bool
volName string
rwlock sync.RWMutex
mpID uint64
enable bool
}
type MetaQuotaInode struct {
inode *Inode
quotaIds []uint32
}
type TxMetaQuotaInode struct {
txinode *TxInode
quotaIds []uint32
}
func NewQuotaManager(volName string, mpId uint64) (mqMgr *MetaQuotaManager) {
mqMgr = &MetaQuotaManager{
statisticTemp: new(sync.Map),
statisticBase: new(sync.Map),
statisticRebuildTemp: new(sync.Map),
statisticRebuildBase: new(sync.Map),
limitedMap: new(sync.Map),
volName: volName,
mpID: mpId,
}
return
}
func (qInode *MetaQuotaInode) Marshal() (result []byte, err error) {
var inodeBytes []byte
quotaBytes := bytes.NewBuffer(make([]byte, 0, 128))
buff := bytes.NewBuffer(make([]byte, 0, 128))
inodeBytes, err = qInode.inode.Marshal()
if err != nil {
return
}
inodeLen := uint32(len(inodeBytes))
if err = binary.Write(buff, binary.BigEndian, inodeLen); err != nil {
return
}
buff.Write(inodeBytes)
for _, quotaId := range qInode.quotaIds {
if err = binary.Write(quotaBytes, binary.BigEndian, quotaId); err != nil {
return
}
}
buff.Write(quotaBytes.Bytes())
result = buff.Bytes()
log.LogDebugf("MetaQuotaInode Marshal inode[%v] inodeLen [%v] size [%v]", qInode.inode.Inode, inodeLen, len(result))
return
}
func (qInode *MetaQuotaInode) Unmarshal(raw []byte) (err error) {
var inodeLen uint32
var quotaId uint32
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &inodeLen); err != nil {
return
}
inodeBytes := make([]byte, inodeLen)
if _, err = buff.Read(inodeBytes); err != nil {
return
}
log.LogDebugf("MetaQuotaInode Unmarshal inodeLen [%v] size [%v]", inodeBytes, len(raw))
qInode.inode = NewInode(0, 0)
if err = qInode.inode.Unmarshal(inodeBytes); err != nil {
return
}
for {
if buff.Len() == 0 {
break
}
if err = binary.Read(buff, binary.BigEndian, "aId); err != nil {
return
}
qInode.quotaIds = append(qInode.quotaIds, quotaId)
}
return
}
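// exampleQuotaInodeRoundTrip is a minimal sketch (illustrative only, not part of the
// original code) of the serialization format used above: a big-endian uint32 length
// prefix, the marshaled inode bytes, then a flat list of big-endian uint32 quota IDs.
func exampleQuotaInodeRoundTrip() error {
src := &MetaQuotaInode{
inode: NewInode(1, 0),
quotaIds: []uint32{100, 101},
}
raw, err := src.Marshal()
if err != nil {
return err
}
// after Unmarshal, dst.inode and dst.quotaIds mirror src
dst := &MetaQuotaInode{}
return dst.Unmarshal(raw)
}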
func (qInode *TxMetaQuotaInode) Marshal() (result []byte, err error) {
var inodeBytes []byte
quotaBytes := bytes.NewBuffer(make([]byte, 0, 128))
buff := bytes.NewBuffer(make([]byte, 0, 128))
inodeBytes, err = qInode.txinode.Marshal()
if err != nil {
return
}
inodeLen := uint32(len(inodeBytes))
if err = binary.Write(buff, binary.BigEndian, inodeLen); err != nil {
return
}
buff.Write(inodeBytes)
for _, quotaId := range qInode.quotaIds {
if err = binary.Write(quotaBytes, binary.BigEndian, quotaId); err != nil {
return
}
}
buff.Write(quotaBytes.Bytes())
result = buff.Bytes()
log.LogDebugf("TxMetaQuotaInode Marshal inode[%v] inodeLen [%v] size [%v]", qInode.txinode.Inode.Inode, inodeLen, len(result))
return
}
func (qInode *TxMetaQuotaInode) Unmarshal(raw []byte) (err error) {
var inodeLen uint32
var quotaId uint32
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &inodeLen); err != nil {
return
}
inodeBytes := make([]byte, inodeLen)
if _, err = buff.Read(inodeBytes); err != nil {
return
}
log.LogDebugf("TxMetaQuotaInode Unmarshal inodeLen [%v] size [%v]", inodeBytes, len(raw))
qInode.txinode = NewTxInode(0, 0, nil)
if err = qInode.txinode.Unmarshal(inodeBytes); err != nil {
return
}
for {
if buff.Len() == 0 {
break
}
if err = binary.Read(buff, binary.BigEndian, "aId); err != nil {
return
}
qInode.quotaIds = append(qInode.quotaIds, quotaId)
}
return
}
func (mqMgr *MetaQuotaManager) setQuotaHbInfo(infos []*proto.QuotaHeartBeatInfo) {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
for _, info := range infos {
if mqMgr.volName != info.VolName {
continue
}
mqMgr.enable = info.Enable
mqMgr.limitedMap.Store(info.QuotaId, info.LimitedInfo)
log.LogDebugf("mp[%v] quotaId [%v] limitedInfo [%v]", mqMgr.mpID, info.QuotaId, info.LimitedInfo)
}
mqMgr.limitedMap.Range(func(key, value interface{}) bool {
quotaId := key.(uint32)
found := false
for _, info := range infos {
if mqMgr.volName != info.VolName {
continue
}
if info.QuotaId == quotaId {
found = true
break
}
}
if !found {
mqMgr.limitedMap.Delete(quotaId)
}
return true
})
return
}
func (mqMgr *MetaQuotaManager) getQuotaReportInfos() (infos []*proto.QuotaReportInfo) {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
var usedInfo proto.QuotaUsedInfo
mqMgr.statisticTemp.Range(func(key, value interface{}) bool {
usedInfo = value.(proto.QuotaUsedInfo)
if value, isFind := mqMgr.statisticBase.Load(key.(uint32)); isFind {
baseInfo := value.(proto.QuotaUsedInfo)
log.LogDebugf("[getQuotaReportInfos] statisticTemp mp[%v] key [%v] usedInfo [%v] baseInfo [%v]", mqMgr.mpID,
key.(uint32), usedInfo, baseInfo)
usedInfo.Add(&baseInfo)
if usedInfo.UsedFiles < 0 {
log.LogWarnf("[getQuotaReportInfos] statisticTemp mp[%v] key [%v] usedInfo [%v]", mqMgr.mpID, key.(uint32), usedInfo)
usedInfo.UsedFiles = 0
}
if usedInfo.UsedBytes < 0 {
log.LogWarnf("[getQuotaReportInfos] statisticTemp mp[%v] key [%v] usedInfo [%v]", mqMgr.mpID, key.(uint32), usedInfo)
usedInfo.UsedBytes = 0
}
}
mqMgr.statisticBase.Store(key.(uint32), usedInfo)
return true
})
mqMgr.statisticTemp = new(sync.Map)
mqMgr.statisticBase.Range(func(key, value interface{}) bool {
quotaId := key.(uint32)
if _, ok := mqMgr.limitedMap.Load(quotaId); !ok {
return true
}
usedInfo = value.(proto.QuotaUsedInfo)
reportInfo := &proto.QuotaReportInfo{
QuotaId: quotaId,
UsedInfo: usedInfo,
}
infos = append(infos, reportInfo)
log.LogDebugf("[getQuotaReportInfos] statisticBase mp[%v] key [%v] usedInfo [%v]", mqMgr.mpID, key.(uint32), usedInfo)
return true
})
return
}
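// exampleQuotaUsageFlow is a minimal sketch (illustrative only, not part of the original
// code) of the statistics flow above: the master registers quota limits through
// setQuotaHbInfo, usage deltas accumulate via updateUsedInfo, and getQuotaReportInfos
// folds the temporary deltas into the base counters and reports only registered quota IDs.
func exampleQuotaUsageFlow() []*proto.QuotaReportInfo {
mqMgr := NewQuotaManager("exampleVol", 1)
mqMgr.setQuotaHbInfo([]*proto.QuotaHeartBeatInfo{{
VolName: "exampleVol",
QuotaId: 100,
Enable: true,
}})
mqMgr.updateUsedInfo(4096, 1, 100) // +4 KiB and +1 file under quota 100
return mqMgr.getQuotaReportInfos()
}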
func (mqMgr *MetaQuotaManager) statisticRebuildStart() bool {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
if !mqMgr.enable {
return false
}
if mqMgr.rbuilding {
return false
}
mqMgr.rbuilding = true
return true
}
func (mqMgr *MetaQuotaManager) statisticRebuildFin(rebuild bool) {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
mqMgr.rbuilding = false
if !rebuild {
mqMgr.statisticRebuildBase = new(sync.Map)
mqMgr.statisticRebuildTemp = new(sync.Map)
return
}
mqMgr.statisticBase = mqMgr.statisticRebuildBase
mqMgr.statisticTemp = mqMgr.statisticRebuildTemp
mqMgr.statisticRebuildBase = new(sync.Map)
mqMgr.statisticRebuildTemp = new(sync.Map)
if log.EnableInfo() {
mqMgr.statisticTemp.Range(func(key, value interface{}) bool {
quotaId := key.(uint32)
usedInfo := value.(proto.QuotaUsedInfo)
log.LogInfof("statisticRebuildFin statisticTemp mp[%v] quotaId [%v] usedInfo [%v]", mqMgr.mpID, quotaId, usedInfo)
return true
})
mqMgr.statisticBase.Range(func(key, value interface{}) bool {
quotaId := key.(uint32)
usedInfo := value.(proto.QuotaUsedInfo)
log.LogInfof("statisticRebuildFin statisticBase mp[%v] quotaId [%v] usedInfo [%v]", mqMgr.mpID, quotaId, usedInfo)
return true
})
}
}
func (mqMgr *MetaQuotaManager) IsOverQuota(size bool, files bool, quotaId uint32) (status uint8) {
var limitedInfo proto.QuotaLimitedInfo
mqMgr.rwlock.RLock()
defer mqMgr.rwlock.RUnlock()
if !mqMgr.enable {
log.LogInfof("IsOverQuota quota [%v] is disable.", quotaId)
return
}
value, isFind := mqMgr.limitedMap.Load(quotaId)
if isFind {
limitedInfo = value.(proto.QuotaLimitedInfo)
if size && limitedInfo.LimitedBytes {
status = proto.OpNoSpaceErr
}
if files && limitedInfo.LimitedFiles {
status = proto.OpNoSpaceErr
}
}
log.LogInfof("IsOverQuota quotaId [%v] limitedInfo[%v] status [%v] isFind [%v]", quotaId, limitedInfo, status, isFind)
return
}
func (mqMgr *MetaQuotaManager) updateUsedInfo(size int64, files int64, quotaId uint32) {
var baseInfo proto.QuotaUsedInfo
var baseTemp proto.QuotaUsedInfo
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
value, isFind := mqMgr.statisticTemp.Load(quotaId)
if isFind {
baseInfo = value.(proto.QuotaUsedInfo)
}
baseInfo.UsedBytes += size
baseInfo.UsedFiles += files
mqMgr.statisticTemp.Store(quotaId, baseInfo)
if mqMgr.rbuilding {
value, isFind = mqMgr.statisticRebuildTemp.Load(quotaId)
if isFind {
baseTemp = value.(proto.QuotaUsedInfo)
} else {
baseTemp.UsedBytes = 0
baseTemp.UsedFiles = 0
}
baseTemp.UsedBytes += size
baseTemp.UsedFiles += files
mqMgr.statisticRebuildTemp.Store(quotaId, baseTemp)
}
log.LogDebugf("updateUsedInfo mpId [%v] quotaId [%v] baseInfo [%v] baseTemp[%v]", mqMgr.mpID, quotaId, baseInfo, baseTemp)
return
}
func (mqMgr *MetaQuotaManager) EnableQuota() bool {
return mqMgr.enable
}
func (mqMgr *MetaQuotaManager) getUsedInfoForTest(quotaId uint32) (size int64, files int64) {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
var baseInfo proto.QuotaUsedInfo
value, isFind := mqMgr.statisticTemp.Load(quotaId)
if isFind {
baseInfo = value.(proto.QuotaUsedInfo)
}
return baseInfo.UsedBytes, baseInfo.UsedFiles
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
syslog "log"
"os"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/xtaci/smux"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
var (
clusterInfo *proto.ClusterInfo
// masterClient *masterSDK.MasterClient
masterClient *masterSDK.MasterCLientWithResolver
configTotalMem uint64
serverPort string
smuxPortShift int
smuxPool *util.SmuxConnectPool
smuxPoolCfg = util.DefaultSmuxConnPoolConfig()
)
// The MetaNode manages the dentry and inode information of the meta partitions on a meta node.
// The data consistency is ensured by Raft.
type MetaNode struct {
nodeId uint64
listen string
bindIp bool
metadataDir string // root dir of the metaNode
raftDir string // root dir of the raftStore log
metadataManager MetadataManager
localAddr string
clusterId string
raftStore raftstore.RaftStore
raftHeartbeatPort string
raftReplicatePort string
raftRetainLogs uint64
raftSyncSnapFormatVersion uint32 // format version of snapshot that raft leader sent to follower
zoneName string
httpStopC chan uint8
smuxStopC chan uint8
metrics *MetaNodeMetrics
tickInterval int
raftRecvBufSize int
connectionCnt int64
clusterUuid string
clusterUuidEnable bool
serviceIDKey string
control common.Control
}
// Start starts up the meta node with the specified configuration.
// 1. Start and load each meta partition from the snapshot.
// 2. Restore raftStore fsm of each meta node range.
// 3. Start server and accept connection from the master and clients.
func (m *MetaNode) Start(cfg *config.Config) (err error) {
return m.control.Start(m, cfg, doStart)
}
// Shutdown stops the meta node.
func (m *MetaNode) Shutdown() {
m.control.Shutdown(m, doShutdown)
}
func (m *MetaNode) checkLocalPartitionMatchWithMaster() (err error) {
var metaNodeInfo *proto.MetaNodeInfo
for i := 0; i < 3; i++ {
if metaNodeInfo, err = masterClient.NodeAPI().GetMetaNode(fmt.Sprintf("%s:%s", m.localAddr, m.listen)); err != nil {
log.LogErrorf("checkLocalPartitionMatchWithMaster: get MetaNode info fail: err(%v)", err)
continue
}
break
}
if err != nil {
return
}
if len(metaNodeInfo.PersistenceMetaPartitions) == 0 {
return
}
lackPartitions := make([]uint64, 0)
for _, partitionID := range metaNodeInfo.PersistenceMetaPartitions {
_, err := m.metadataManager.GetPartition(partitionID)
if err != nil {
lackPartitions = append(lackPartitions, partitionID)
}
}
if len(lackPartitions) == 0 {
return
}
m.metrics.MetricMetaFailedPartition.SetWithLabels(float64(1), map[string]string{
"partids": fmt.Sprintf("%v", lackPartitions),
"node": m.localAddr + ":" + m.listen,
"nodeid": fmt.Sprintf("%d", m.nodeId),
})
log.LogErrorf("LackPartitions %v on metanode %v, please deal quickly", lackPartitions, m.localAddr+":"+m.listen)
return
}
func doStart(s common.Server, cfg *config.Config) (err error) {
m, ok := s.(*MetaNode)
if !ok {
return errors.New("Invalid node Type!")
}
if err = m.parseConfig(cfg); err != nil {
return
}
if err = m.register(); err != nil {
return
}
if err = m.startRaftServer(cfg); err != nil {
return
}
if err = m.newMetaManager(); err != nil {
return
}
if err = m.startServer(); err != nil {
return
}
if err = m.startSmuxServer(); err != nil {
return
}
if err = m.startMetaManager(); err != nil {
return
}
if err = m.registerAPIHandler(); err != nil {
return
}
go m.startUpdateNodeInfo()
exporter.Init(cfg.GetString("role"), cfg)
m.startStat()
// compare local partitions with the master; if any are missing, do not start
if err = m.checkLocalPartitionMatchWithMaster(); err != nil {
syslog.Println(err)
exporter.Warning(err.Error())
return
}
exporter.RegistConsul(m.clusterId, cfg.GetString("role"), cfg)
return
}
func doShutdown(s common.Server) {
m, ok := s.(*MetaNode)
if !ok {
return
}
m.stopUpdateNodeInfo()
// shutdown node and release the resource
m.stopStat()
m.stopServer()
m.stopSmuxServer()
m.stopMetaManager()
m.stopRaftServer()
masterClient.Stop()
}
// Sync blocks the invoker's goroutine until the meta node shuts down.
func (m *MetaNode) Sync() {
m.control.Sync()
}
func (m *MetaNode) parseConfig(cfg *config.Config) (err error) {
if cfg == nil {
err = errors.New("invalid configuration")
return
}
m.localAddr = cfg.GetString(cfgLocalIP)
m.listen = cfg.GetString(proto.ListenPort)
m.bindIp = cfg.GetBool(proto.BindIpKey)
serverPort = m.listen
m.metadataDir = cfg.GetString(cfgMetadataDir)
m.raftDir = cfg.GetString(cfgRaftDir)
m.raftHeartbeatPort = cfg.GetString(cfgRaftHeartbeatPort)
m.raftReplicatePort = cfg.GetString(cfgRaftReplicaPort)
m.tickInterval = int(cfg.GetFloat(cfgTickInterval))
m.raftRecvBufSize = int(cfg.GetInt(cfgRaftRecvBufSize))
m.zoneName = cfg.GetString(cfgZoneName)
deleteBatchCount := cfg.GetInt64(cfgDeleteBatchCount)
if deleteBatchCount > 1 {
updateDeleteBatchCount(uint64(deleteBatchCount))
}
m.serviceIDKey = cfg.GetString(cfgServiceIDKey)
total, _, err := util.GetMemInfo()
if err != nil {
log.LogErrorf("get total mem failed, err %s", err.Error())
}
ratioStr := cfg.GetString(cfgMemRatio)
if err == nil && ratioStr != "" {
ratio, _ := strconv.Atoi(ratioStr)
if ratio <= 0 || ratio >= 100 {
return fmt.Errorf("cfgMemRatio is not legal, shoule beteen 1-100, now %s", ratioStr)
}
configTotalMem = total * uint64(ratio) / 100
log.LogInfof("configTotalMem by ratio is: mem [%d], ratio[%d]", configTotalMem, ratio)
} else {
configTotalMem, _ = strconv.ParseUint(cfg.GetString(cfgTotalMem), 10, 64)
if configTotalMem == 0 {
return fmt.Errorf("bad totalMem config,Recommended to be configured as 80 percent of physical machine memory")
}
}
if err == nil && configTotalMem > total-util.GB {
return fmt.Errorf("bad totalMem config,Recommended to be configured as 80 percent of physical machine memory")
}
if m.metadataDir == "" {
return fmt.Errorf("bad metadataDir config")
}
if m.listen == "" {
return fmt.Errorf("bad listen config")
}
if m.raftDir == "" {
return fmt.Errorf("bad raftDir config")
}
if m.raftHeartbeatPort == "" {
return fmt.Errorf("bad raftHeartbeatPort config")
}
if m.raftReplicatePort == "" {
return fmt.Errorf("bad cfgRaftReplicaPort config")
}
raftRetainLogs := cfg.GetString(cfgRetainLogs)
if raftRetainLogs != "" {
if m.raftRetainLogs, err = strconv.ParseUint(raftRetainLogs, 10, 64); err != nil {
return fmt.Errorf("%v, err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.raftRetainLogs <= 0 {
m.raftRetainLogs = DefaultRaftNumOfLogsToRetain
}
syslog.Println("conf raftRetainLogs=", m.raftRetainLogs)
log.LogInfof("[parseConfig] raftRetainLogs[%v]", m.raftRetainLogs)
if cfg.HasKey(cfgRaftSyncSnapFormatVersion) {
raftSyncSnapFormatVersion := uint32(cfg.GetInt64(cfgRaftSyncSnapFormatVersion))
if raftSyncSnapFormatVersion > SnapFormatVersion_1 {
m.raftSyncSnapFormatVersion = SnapFormatVersion_1
log.LogInfof("invalid config raftSyncSnapFormatVersion, using default[%v]", m.raftSyncSnapFormatVersion)
} else {
m.raftSyncSnapFormatVersion = raftSyncSnapFormatVersion
log.LogInfof("by config raftSyncSnapFormatVersion:[%v]", m.raftSyncSnapFormatVersion)
}
} else {
m.raftSyncSnapFormatVersion = SnapFormatVersion_1
log.LogInfof("using default raftSyncSnapFormatVersion[%v]", m.raftSyncSnapFormatVersion)
}
syslog.Println("conf raftSyncSnapFormatVersion=", m.raftSyncSnapFormatVersion)
log.LogInfof("[parseConfig] raftSyncSnapFormatVersion[%v]", m.raftSyncSnapFormatVersion)
constCfg := config.ConstConfig{
Listen: m.listen,
RaftHeartbetPort: m.raftHeartbeatPort,
RaftReplicaPort: m.raftReplicatePort,
}
ok := false
if ok, err = config.CheckOrStoreConstCfg(m.metadataDir, config.DefaultConstConfigFile, &constCfg); !ok {
log.LogErrorf("constCfg check failed %v %v %v %v", m.metadataDir, config.DefaultConstConfigFile, constCfg, err)
return fmt.Errorf("constCfg check failed %v %v %v %v", m.metadataDir, config.DefaultConstConfigFile, constCfg, err)
}
log.LogInfof("[parseConfig] load localAddr[%v].", m.localAddr)
log.LogInfof("[parseConfig] load listen[%v].", m.listen)
log.LogInfof("[parseConfig] load metadataDir[%v].", m.metadataDir)
log.LogInfof("[parseConfig] load raftDir[%v].", m.raftDir)
log.LogInfof("[parseConfig] load raftHeartbeatPort[%v].", m.raftHeartbeatPort)
log.LogInfof("[parseConfig] load raftReplicatePort[%v].", m.raftReplicatePort)
log.LogInfof("[parseConfig] load zoneName[%v].", m.zoneName)
if err = m.parseSmuxConfig(cfg); err != nil {
return fmt.Errorf("parseSmuxConfig fail err %v", err)
} else {
log.LogInfof("Start: init smux conn pool (%v).", smuxPoolCfg)
smuxPool = util.NewSmuxConnectPool(smuxPoolCfg)
}
addrs := cfg.GetSlice(proto.MasterAddr)
masters := make([]string, 0, len(addrs))
for _, addr := range addrs {
masters = append(masters, addr.(string))
}
updateInterval := cfg.GetInt(configNameResolveInterval)
if updateInterval <= 0 || updateInterval > 60 {
log.LogWarnf("name resolving interval[1-60] is set to default: %v", DefaultNameResolveInterval)
updateInterval = DefaultNameResolveInterval
}
// masterClient = masterSDK.NewMasterClient(masters, false)
masterClient = masterSDK.NewMasterCLientWithResolver(masters, false, updateInterval)
if masterClient == nil {
err = fmt.Errorf("parseConfig: masters addrs format err[%v]", masters)
log.LogErrorf("parseConfig: masters addrs format err[%v]", masters)
return err
}
if err = masterClient.Start(); err != nil {
return err
}
err = m.validConfig()
return
}
func (m *MetaNode) parseSmuxConfig(cfg *config.Config) error {
// SMux port
smuxPortShift = int(cfg.GetInt64(cfgSmuxPortShift))
if smuxPortShift == 0 {
smuxPortShift = util.DefaultSmuxPortShift
}
// SMux buffer
maxBuffer := cfg.GetInt64(cfgSmuxMaxBuffer)
if maxBuffer > 0 {
smuxPoolCfg.MaxReceiveBuffer = int(maxBuffer)
if smuxPoolCfg.MaxStreamBuffer > int(maxBuffer) {
smuxPoolCfg.MaxStreamBuffer = int(maxBuffer)
}
if err := smux.VerifyConfig(smuxPoolCfg.Config); err != nil {
return err
}
}
maxConn := cfg.GetInt64(cfgSmuxMaxConn)
if maxConn > 0 {
smuxPoolCfg.ConnsPerAddr = int(maxConn)
}
maxStreamPerConn := cfg.GetInt64(cfgSmuxStreamPerConn)
if maxStreamPerConn > 0 {
smuxPoolCfg.StreamsPerConn = int(maxStreamPerConn)
}
if err := util.VerifySmuxPoolConfig(smuxPoolCfg); err != nil {
return err
}
log.LogDebugf("[parseSmuxConfig] cfg %v.", smuxPoolCfg)
return nil
}
func (m *MetaNode) validConfig() (err error) {
if len(strings.TrimSpace(m.listen)) == 0 {
err = errors.New("illegal listen")
return
}
if m.metadataDir == "" {
m.metadataDir = defaultMetadataDir
}
if m.raftDir == "" {
m.raftDir = defaultRaftDir
}
if len(masterClient.Nodes()) == 0 {
err = errors.New("master address list is empty")
return
}
return
}
func (m *MetaNode) newMetaManager() (err error) {
if _, err = os.Stat(m.metadataDir); err != nil {
if err = os.MkdirAll(m.metadataDir, 0o755); err != nil {
return
}
}
if m.clusterUuidEnable {
if err = config.CheckOrStoreClusterUuid(m.metadataDir, m.clusterUuid, false); err != nil {
log.LogErrorf("CheckOrStoreClusterUuid failed: %v", err)
return fmt.Errorf("CheckOrStoreClusterUuid failed: %v", err)
}
}
constCfg := config.ConstConfig{
Listen: m.listen,
RaftHeartbetPort: m.raftHeartbeatPort,
RaftReplicaPort: m.raftReplicatePort,
}
ok := false
if ok, err = config.CheckOrStoreConstCfg(m.metadataDir, config.DefaultConstConfigFile, &constCfg); !ok {
log.LogErrorf("constCfg check failed %v %v %v %v", m.metadataDir, config.DefaultConstConfigFile, constCfg, err)
return fmt.Errorf("constCfg check failed %v %v %v %v", m.metadataDir, config.DefaultConstConfigFile, constCfg, err)
}
// load metadataManager
conf := MetadataManagerConfig{
NodeID: m.nodeId,
RootDir: m.metadataDir,
RaftStore: m.raftStore,
ZoneName: m.zoneName,
}
m.metadataManager = NewMetadataManager(conf, m)
return
}
func (m *MetaNode) startMetaManager() (err error) {
if err = m.metadataManager.Start(); err == nil {
log.LogInfof("[startMetaManager] manager start finish.")
}
return
}
func (m *MetaNode) stopMetaManager() {
if m.metadataManager != nil {
m.metadataManager.Stop()
}
}
func (m *MetaNode) register() (err error) {
step := 0
var nodeAddress string
for {
if step < 1 {
clusterInfo, err = getClusterInfo()
if err != nil {
log.LogErrorf("[register] %s", err.Error())
continue
}
if m.localAddr == "" {
m.localAddr = clusterInfo.Ip
}
m.clusterUuid = clusterInfo.ClusterUuid
m.clusterUuidEnable = clusterInfo.ClusterUuidEnable
m.clusterId = clusterInfo.Cluster
nodeAddress = m.localAddr + ":" + m.listen
step++
}
var nodeID uint64
if nodeID, err = masterClient.NodeAPI().AddMetaNodeWithAuthNode(nodeAddress, m.zoneName, m.serviceIDKey); err != nil {
log.LogErrorf("register: register to master fail: address(%v) err(%s)", nodeAddress, err)
time.Sleep(3 * time.Second)
continue
}
m.nodeId = nodeID
return
}
}
// NewServer creates a new meta node instance.
func NewServer() *MetaNode {
return &MetaNode{}
}
func getClusterInfo() (ci *proto.ClusterInfo, err error) {
ci, err = masterClient.AdminAPI().GetClusterInfo()
return
}
// AddConnection adds a connection.
func (m *MetaNode) AddConnection() {
atomic.AddInt64(&m.connectionCnt, 1)
}
// RemoveConnection removes a connection.
func (m *MetaNode) RemoveConnection() {
atomic.AddInt64(&m.connectionCnt, -1)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"time"
"github.com/cubefs/cubefs/util/exporter"
)
// metrics
const (
StatPeriod = time.Minute * time.Duration(1)
MetricMetaFailedPartition = "meta_failed_partition"
MetricMetaPartitionInodeCount = "mpInodeCount"
MetricMetaPartitionDentryCount = "mpDentryCount"
MetricConnectionCount = "connectionCnt"
)
type MetaNodeMetrics struct {
MetricConnectionCount *exporter.Gauge
MetricMetaFailedPartition *exporter.Gauge
MetricMetaPartitionInodeCount *exporter.Gauge
MetricMetaPartitionDentryCount *exporter.Gauge
metricStopCh chan struct{}
}
func (m *MetaNode) startStat() {
m.metrics = &MetaNodeMetrics{
metricStopCh: make(chan struct{}),
MetricConnectionCount: exporter.NewGauge(MetricConnectionCount),
MetricMetaFailedPartition: exporter.NewGauge(MetricMetaFailedPartition),
MetricMetaPartitionInodeCount: exporter.NewGauge(MetricMetaPartitionInodeCount),
MetricMetaPartitionDentryCount: exporter.NewGauge(MetricMetaPartitionDentryCount),
}
go m.collectPartitionMetrics()
}
func (m *MetaNode) updatePartitionMetrics(mp *metaPartition) {
labels := map[string]string{
"partid": fmt.Sprintf("%d", mp.config.PartitionId),
exporter.Vol: mp.config.VolName,
}
m.metrics.MetricMetaPartitionInodeCount.SetWithLabels(float64(mp.GetInodeTreeLen()), labels)
m.metrics.MetricMetaPartitionDentryCount.SetWithLabels(float64(mp.GetDentryTreeLen()), labels)
}
func (m *MetaNode) collectPartitionMetrics() {
ticker := time.NewTicker(StatPeriod)
for {
select {
case <-m.metrics.metricStopCh:
return
case <-ticker.C:
if manager, ok := m.metadataManager.(*metadataManager); ok {
manager.mu.RLock()
for _, p := range manager.partitions {
if mp, ok := p.(*metaPartition); ok {
m.updatePartitionMetrics(mp)
}
}
manager.mu.RUnlock()
}
m.metrics.MetricConnectionCount.Set(float64(m.connectionCnt))
}
}
}
func (m *MetaNode) stopStat() {
m.metrics.metricStopCh <- struct{}{}
}
// Code generated by MockGen. DO NOT EDIT.
// Source: raftstore/partition.go
// Package raftstoremock is a generated GoMock package.
package raftstoremock
import (
reflect "reflect"
proto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
raftstore "github.com/cubefs/cubefs/raftstore"
gomock "github.com/golang/mock/gomock"
)
// MockPartition is a mock of Partition interface.
type MockPartition struct {
ctrl *gomock.Controller
recorder *MockPartitionMockRecorder
}
// MockPartitionMockRecorder is the mock recorder for MockPartition.
type MockPartitionMockRecorder struct {
mock *MockPartition
}
// NewMockPartition creates a new mock instance.
func NewMockPartition(ctrl *gomock.Controller) *MockPartition {
mock := &MockPartition{ctrl: ctrl}
mock.recorder = &MockPartitionMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockPartition) EXPECT() *MockPartitionMockRecorder {
return m.recorder
}
// AppliedIndex mocks base method.
func (m *MockPartition) AppliedIndex() uint64 {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "AppliedIndex")
ret0, _ := ret[0].(uint64)
return ret0
}
// AppliedIndex indicates an expected call of AppliedIndex.
func (mr *MockPartitionMockRecorder) AppliedIndex() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AppliedIndex", reflect.TypeOf((*MockPartition)(nil).AppliedIndex))
}
// ChangeMember mocks base method.
func (m *MockPartition) ChangeMember(changeType proto.ConfChangeType, peer proto.Peer, context []byte) (interface{}, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "ChangeMember", changeType, peer, context)
ret0, _ := ret[0].(interface{})
ret1, _ := ret[1].(error)
return ret0, ret1
}
// ChangeMember indicates an expected call of ChangeMember.
func (mr *MockPartitionMockRecorder) ChangeMember(changeType, peer, context interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ChangeMember", reflect.TypeOf((*MockPartition)(nil).ChangeMember), changeType, peer, context)
}
// CommittedIndex mocks base method.
func (m *MockPartition) CommittedIndex() uint64 {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "CommittedIndex")
ret0, _ := ret[0].(uint64)
return ret0
}
// CommittedIndex indicates an expected call of CommittedIndex.
func (mr *MockPartitionMockRecorder) CommittedIndex() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CommittedIndex", reflect.TypeOf((*MockPartition)(nil).CommittedIndex))
}
// IsRestoring mocks base method.
func (m *MockPartition) IsRestoring() bool {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "IsRestoring")
ret0, _ := ret[0].(bool)
return ret0
}
// IsRestoring indicates an expected call of IsRestoring.
func (mr *MockPartitionMockRecorder) IsRestoring() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsRestoring", reflect.TypeOf((*MockPartition)(nil).IsRestoring))
}
// Delete mocks base method.
func (m *MockPartition) Delete() error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Delete")
ret0, _ := ret[0].(error)
return ret0
}
// Delete indicates an expected call of Delete.
func (mr *MockPartitionMockRecorder) Delete() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Delete", reflect.TypeOf((*MockPartition)(nil).Delete))
}
// IsOfflinePeer mocks base method.
func (m *MockPartition) IsOfflinePeer() bool {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "IsOfflinePeer")
ret0, _ := ret[0].(bool)
return ret0
}
// IsOfflinePeer indicates an expected call of IsOfflinePeer.
func (mr *MockPartitionMockRecorder) IsOfflinePeer() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsOfflinePeer", reflect.TypeOf((*MockPartition)(nil).IsOfflinePeer))
}
// IsRaftLeader mocks base method.
func (m *MockPartition) IsRaftLeader() bool {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "IsRaftLeader")
ret0, _ := ret[0].(bool)
return ret0
}
// IsRaftLeader indicates an expected call of IsRaftLeader.
func (mr *MockPartitionMockRecorder) IsRaftLeader() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsRaftLeader", reflect.TypeOf((*MockPartition)(nil).IsRaftLeader))
}
// LeaderTerm mocks base method.
func (m *MockPartition) LeaderTerm() (uint64, uint64) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "LeaderTerm")
ret0, _ := ret[0].(uint64)
ret1, _ := ret[1].(uint64)
return ret0, ret1
}
// LeaderTerm indicates an expected call of LeaderTerm.
func (mr *MockPartitionMockRecorder) LeaderTerm() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LeaderTerm", reflect.TypeOf((*MockPartition)(nil).LeaderTerm))
}
// Status mocks base method.
func (m *MockPartition) Status() *raftstore.PartitionStatus {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Status")
ret0, _ := ret[0].(*raftstore.PartitionStatus)
return ret0
}
// Status indicates an expected call of Status.
func (mr *MockPartitionMockRecorder) Status() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Status", reflect.TypeOf((*MockPartition)(nil).Status))
}
// Stop mocks base method.
func (m *MockPartition) Stop() error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Stop")
ret0, _ := ret[0].(error)
return ret0
}
// Stop indicates an expected call of Stop.
func (mr *MockPartitionMockRecorder) Stop() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockPartition)(nil).Stop))
}
// Submit mocks base method.
func (m *MockPartition) Submit(cmd []byte) (interface{}, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Submit", cmd)
ret0, _ := ret[0].(interface{})
ret1, _ := ret[1].(error)
return ret0, ret1
}
// Submit indicates an expected call of Submit.
func (mr *MockPartitionMockRecorder) Submit(cmd interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Submit", reflect.TypeOf((*MockPartition)(nil).Submit), cmd)
}
// Truncate mocks base method.
func (m *MockPartition) Truncate(index uint64) {
m.ctrl.T.Helper()
m.ctrl.Call(m, "Truncate", index)
}
// Truncate indicates an expected call of Truncate.
func (mr *MockPartitionMockRecorder) Truncate(index interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Truncate", reflect.TypeOf((*MockPartition)(nil).Truncate), index)
}
// TryToLeader mocks base method.
func (m *MockPartition) TryToLeader(nodeID uint64) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "TryToLeader", nodeID)
ret0, _ := ret[0].(error)
return ret0
}
// TryToLeader indicates an expected call of TryToLeader.
func (mr *MockPartitionMockRecorder) TryToLeader(nodeID interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TryToLeader", reflect.TypeOf((*MockPartition)(nil).TryToLeader), nodeID)
}
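// Editorial sketch (not part of the generated file): typical use of this mock in a
// test. It assumes a *testing.T named t; NewController, EXPECT, Return, AnyTimes and
// gomock.Any are standard gomock APIs.
//
//	ctrl := gomock.NewController(t)
//	defer ctrl.Finish()
//	partition := NewMockPartition(ctrl)
//	partition.EXPECT().IsRaftLeader().Return(true)
//	partition.EXPECT().Submit(gomock.Any()).Return(nil, nil).AnyTimes()
//	// the mock can now be passed wherever the mocked Partition interface is expected
//	if !partition.IsRaftLeader() {
//		t.Fatal("expected the mocked partition to report leadership")
//	}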
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"sort"
"sync"
"time"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
// Part defined necessary fields for multipart part management.
type Part struct {
ID uint16
UploadTime time.Time
MD5 string
Size uint64
Inode uint64
}
func (m *Part) Equal(o *Part) bool {
return m.ID == o.ID &&
m.Inode == o.Inode &&
m.Size == o.Size &&
m.MD5 == o.MD5
}
func (m Part) Bytes() ([]byte, error) {
var err error
buffer := bytes.NewBuffer(nil)
tmp := make([]byte, binary.MaxVarintLen64)
var n int
// ID
n = binary.PutUvarint(tmp, uint64(m.ID))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// upload time
n = binary.PutVarint(tmp, m.UploadTime.UnixNano())
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// MD5
n = binary.PutUvarint(tmp, uint64(len(m.MD5)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
if _, err = buffer.WriteString(m.MD5); err != nil {
return nil, err
}
// size
n = binary.PutUvarint(tmp, m.Size)
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// inode
n = binary.PutUvarint(tmp, m.Inode)
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
func PartFromBytes(raw []byte) *Part {
var offset, n int
// decode ID
var u64ID uint64
u64ID, n = binary.Uvarint(raw)
offset += n
// decode upload time
var uploadTimeI64 int64
uploadTimeI64, n = binary.Varint(raw[offset:])
offset += n
// decode MD5
var md5Len uint64
md5Len, n = binary.Uvarint(raw[offset:])
offset += n
md5Content := string(raw[offset : offset+int(md5Len)])
offset += int(md5Len)
// decode size
var sizeU64 uint64
sizeU64, n = binary.Uvarint(raw[offset:])
offset += n
// decode inode
var inode uint64
inode, n = binary.Uvarint(raw[offset:])
muPart := &Part{
ID: uint16(u64ID),
UploadTime: time.Unix(0, uploadTimeI64),
MD5: md5Content,
Size: sizeU64,
Inode: inode,
}
return muPart
}
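// Editorial sketch (not part of the original source): round-tripping a Part through the
// varint encoding above. Bytes writes ID, UploadTime (UnixNano), MD5 (length-prefixed),
// Size and Inode in that order, and PartFromBytes decodes them in the same order.
//
//	p := Part{ID: 1, UploadTime: time.Now(), MD5: "d41d8cd98f00b204e9800998ecf8427e", Size: 4096, Inode: 8193}
//	raw, err := p.Bytes()
//	if err != nil {
//		// handle the marshal error
//	}
//	decoded := PartFromBytes(raw)
//	// decoded.Equal(&p) is true: Equal compares ID, Inode, Size and MD5
//	// (UploadTime is serialized but not part of Equal).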
type Parts []*Part
func (m Parts) Len() int {
return len(m)
}
func (m Parts) sort() {
sort.SliceStable(m, func(i, j int) bool {
return m[i].ID < m[j].ID
})
}
func (m *Parts) Hash(part *Part) (has bool) {
i := sort.Search(len(*m), func(i int) bool {
return (*m)[i].ID >= part.ID
})
has = i < len(*m) && (*m)[i].ID == part.ID
return
}
func (m *Parts) UpdateOrStore(part *Part) (oldInode uint64, update, conflict bool) {
i := sort.Search(len(*m), func(i int) bool {
return (*m)[i].ID >= part.ID
})
if i >= 0 && i < len(*m) && (*m)[i].ID == part.ID {
oldPart := (*m)[i]
oldInode = oldPart.Inode
if part.Inode == oldInode {
log.LogWarnf("Request already success,the same partinode[%d] must not be overwritten.", oldInode)
return
}
if part.UploadTime.Before(oldPart.UploadTime) {
log.LogWarnf("Request part putTime[%v] is less than old part putTime[%v], partNumber[%v]",
part.UploadTime.UnixNano(), oldPart.UploadTime.UnixNano(), part.ID)
conflict = true
return
}
update = true
(*m)[i] = part
return
}
*m = append(*m, part)
update = false
m.sort()
return
}
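// Editorial note (not part of the original source): UpdateOrStore has three outcomes for
// an existing part ID — the same inode is a no-op, an older UploadTime than the stored
// part sets conflict=true and keeps the stored part, and anything else replaces the part
// in place with update=true; unknown IDs are appended and the slice is re-sorted. A
// hedged caller sketch:
//
//	oldInode, updated, conflict := parts.UpdateOrStore(part)
//	if conflict {
//		// a part with a newer upload time already exists; reject this request
//	} else if updated {
//		// oldInode is the inode of the replaced part; the caller can reclaim it
//	}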
// Deprecated: replaced by UpdateOrStore.
func (m *Parts) Insert(part *Part, replace bool) (success bool) {
i := sort.Search(len(*m), func(i int) bool {
return (*m)[i].ID >= part.ID
})
if i < len(*m) && (*m)[i].ID == part.ID {
if replace {
(*m)[i] = part
return true
}
return false
}
*m = append(*m, part)
m.sort()
return true
}
func (m *Parts) Remove(id uint16) {
i := sort.Search(len(*m), func(i int) bool {
return (*m)[i].ID >= id
})
if i < len(*m) && (*m)[i].ID == id {
if len(*m) > i+1 {
*m = append((*m)[:i], (*m)[i+1:]...)
} else {
*m = (*m)[:i]
}
}
}
func (m Parts) Search(id uint16) (part *Part, found bool) {
i := sort.Search(len(m), func(i int) bool {
return m[i].ID >= id
})
if i < len(m) && m[i].ID == id {
return m[i], true
}
return nil, false
}
func (m Parts) Bytes() ([]byte, error) {
var err error
var n int
buffer := bytes.NewBuffer(nil)
tmp := make([]byte, binary.MaxVarintLen64)
n = binary.PutUvarint(tmp, uint64(len(m)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
var marshaled []byte
for _, p := range m {
marshaled, err = p.Bytes()
if err != nil {
return nil, err
}
// write part length
n = binary.PutUvarint(tmp, uint64(len(marshaled)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write part bytes
if _, err = buffer.Write(marshaled); err != nil {
return nil, err
}
}
return buffer.Bytes(), nil
}
func PartsFromBytes(raw []byte) Parts {
var offset, n int
var numPartsU64 uint64
numPartsU64, n = binary.Uvarint(raw)
offset += n
muParts := make([]*Part, int(numPartsU64))
for i := 0; i < int(numPartsU64); i++ {
var partLengthU64 uint64
partLengthU64, n = binary.Uvarint(raw[offset:])
offset += n
part := PartFromBytes(raw[offset : offset+int(partLengthU64)])
muParts[i] = part
offset += int(partLengthU64)
}
return muParts
}
type MultipartExtend map[string]string
func NewMultipartExtend() MultipartExtend {
return make(map[string]string)
}
func (me MultipartExtend) Bytes() ([]byte, error) {
var n int
var err error
buffer := bytes.NewBuffer(nil)
tmp := make([]byte, binary.MaxVarintLen64)
n = binary.PutUvarint(tmp, uint64(len(me)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
marshalStr := func(src string) error {
n = binary.PutUvarint(tmp, uint64(len(src)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return err
}
if _, err = buffer.WriteString(src); err != nil {
return err
}
return nil
}
for key, val := range me {
if err = marshalStr(key); err != nil {
return nil, err
}
if err = marshalStr(val); err != nil {
return nil, err
}
}
return buffer.Bytes(), nil
}
func MultipartExtendFromBytes(raw []byte) MultipartExtend {
var offset, n int
var el uint64
me := NewMultipartExtend()
unmarshalStr := func(data []byte) (string, int) {
var n int
var lengthU64 uint64
lengthU64, n = binary.Uvarint(data)
return string(data[n : n+int(lengthU64)]), n + int(lengthU64)
}
el, n = binary.Uvarint(raw)
if el == 0 {
return nil
}
offset += n
for i := 0; i < int(el); i++ {
var key, val string
key, n = unmarshalStr(raw[offset:])
offset += n
val, n = unmarshalStr(raw[offset:])
offset += n
me[key] = val
}
return me
}
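// Editorial sketch (not part of the original source): MultipartExtend is serialized as a
// uvarint pair count followed by length-prefixed key and value strings, and an empty map
// decodes to nil (see the early return above when the pair count is zero).
//
//	me := NewMultipartExtend()
//	me["x-amz-meta-owner"] = "alice" // hypothetical key/value for illustration
//	raw, _ := me.Bytes()
//	back := MultipartExtendFromBytes(raw) // back["x-amz-meta-owner"] == "alice"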
// Multipart defined necessary fields for multipart session management.
type Multipart struct {
// session fields
id string
key string
initTime time.Time
parts Parts
extend MultipartExtend
mu sync.RWMutex
}
func (m *Multipart) Less(than btree.Item) bool {
tm, is := than.(*Multipart)
return is && ((m.key < tm.key) || ((m.key == tm.key) && (m.id < tm.id)))
}
func (m *Multipart) Copy() btree.Item {
return &Multipart{
id: m.id,
key: m.key,
initTime: m.initTime,
parts: append(Parts{}, m.parts...),
extend: m.extend,
}
}
func (m *Multipart) ID() string {
return m.id
}
func (m *Multipart) UpdateOrStorePart(part *Part) (oldInode uint64, updated, conflict bool) {
m.mu.Lock()
defer m.mu.Unlock()
if m.parts == nil {
m.parts = PartsFromBytes(nil)
}
oldInode, updated, conflict = m.parts.UpdateOrStore(part)
return
}
// Deprecated: replaced by UpdateOrStorePart.
func (m *Multipart) InsertPart(part *Part, replace bool) (success bool) {
m.mu.Lock()
defer m.mu.Unlock()
if m.parts == nil {
m.parts = PartsFromBytes(nil)
}
success = m.parts.Insert(part, replace)
return
}
func (m *Multipart) Parts() []*Part {
m.mu.RLock()
defer m.mu.RUnlock()
return append([]*Part{}, m.parts...)
}
func (m *Multipart) Bytes() ([]byte, error) {
var n int
buffer := bytes.NewBuffer(nil)
var err error
tmp := make([]byte, binary.MaxVarintLen64)
// marshal id
marshalStr := func(src string) error {
n = binary.PutUvarint(tmp, uint64(len(src)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return err
}
if _, err = buffer.WriteString(src); err != nil {
return err
}
return nil
}
// marshal id
if err = marshalStr(m.id); err != nil {
return nil, err
}
// marshal key
if err = marshalStr(m.key); err != nil {
return nil, err
}
// marshal init time
n = binary.PutVarint(tmp, m.initTime.UnixNano())
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// marshal parts
var marshaledParts []byte
if marshaledParts, err = m.parts.Bytes(); err != nil {
return nil, err
}
n = binary.PutUvarint(tmp, uint64(len(marshaledParts)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
if _, err = buffer.Write(marshaledParts); err != nil {
return nil, err
}
// marshal extend
var extendBytes []byte
if extendBytes, err = m.extend.Bytes(); err != nil {
return nil, err
}
n = binary.PutUvarint(tmp, uint64(len(extendBytes)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
if _, err = buffer.Write(extendBytes); err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
func MultipartFromBytes(raw []byte) *Multipart {
unmarshalStr := func(data []byte) (string, int) {
var n int
var lengthU64 uint64
lengthU64, n = binary.Uvarint(data)
return string(data[n : n+int(lengthU64)]), n + int(lengthU64)
}
var offset, n int
// decode id
var id string
id, n = unmarshalStr(raw)
offset += n
// decode key
var key string
key, n = unmarshalStr(raw[offset:])
offset += n
// decode init time
var initTimeI64 int64
initTimeI64, n = binary.Varint(raw[offset:])
offset += n
// decode parts
var partsLengthU64 uint64
partsLengthU64, n = binary.Uvarint(raw[offset:])
offset += n
parts := PartsFromBytes(raw[offset : offset+int(partsLengthU64)])
offset += int(partsLengthU64)
// decode multipart extend
var extendLengthU64 uint64
extendLengthU64, n = binary.Uvarint(raw[offset:])
offset += n
me := MultipartExtendFromBytes(raw[offset : offset+int(extendLengthU64)])
muSession := &Multipart{
id: id,
key: key,
initTime: time.Unix(0, initTimeI64),
parts: parts,
extend: me,
}
return muSession
}
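// Editorial note (not part of the original source): on the wire a Multipart is laid out
// as id and key (length-prefixed strings), initTime as a varint UnixNano, then the
// length-prefixed Parts blob and the length-prefixed MultipartExtend blob; the decoder
// above reads the fields back in exactly this order.
//
//	m := &Multipart{id: "upload-1", key: "a/b/object", initTime: time.Now()}
//	raw, err := m.Bytes()
//	if err == nil {
//		restored := MultipartFromBytes(raw)
//		_ = restored.ID() // "upload-1"
//	}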
package metanode
import (
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
UpdateNodeInfoTicket = 1 * time.Minute
DefaultDeleteBatchCounts = 128
)
type NodeInfo struct {
deleteBatchCount uint64
}
var (
nodeInfo = &NodeInfo{}
nodeInfoStopC = make(chan struct{})
deleteWorkerSleepMs uint64 = 0
dirChildrenNumLimit uint32 = proto.DefaultDirChildrenNumLimit
)
func DeleteBatchCount() uint64 {
val := atomic.LoadUint64(&nodeInfo.deleteBatchCount)
if val == 0 {
val = DefaultDeleteBatchCounts
}
return val
}
func updateDeleteBatchCount(val uint64) {
atomic.StoreUint64(&nodeInfo.deleteBatchCount, val)
}
func updateDeleteWorkerSleepMs(val uint64) {
atomic.StoreUint64(&deleteWorkerSleepMs, val)
}
func updateDirChildrenNumLimit(val uint32) {
atomic.StoreUint32(&dirChildrenNumLimit, val)
}
func DeleteWorkerSleepMs() {
val := atomic.LoadUint64(&deleteWorkerSleepMs)
if val > 0 {
time.Sleep(time.Duration(val) * time.Millisecond)
}
}
func (m *MetaNode) startUpdateNodeInfo() {
ticker := time.NewTicker(UpdateNodeInfoTicket)
defer ticker.Stop()
for {
select {
case <-nodeInfoStopC:
log.LogInfo("metanode nodeinfo gorutine stopped")
return
case <-ticker.C:
m.updateNodeInfo()
m.metadataManager.checkVolVerList()
}
}
}
func (m *MetaNode) stopUpdateNodeInfo() {
nodeInfoStopC <- struct{}{}
}
func (m *MetaNode) updateNodeInfo() {
// clusterInfo, err := getClusterInfo()
clusterInfo, err := masterClient.AdminAPI().GetClusterInfo()
if err != nil {
log.LogErrorf("[updateNodeInfo] %s", err.Error())
return
}
updateDeleteBatchCount(clusterInfo.MetaNodeDeleteBatchCount)
updateDeleteWorkerSleepMs(clusterInfo.MetaNodeDeleteWorkerSleepMs)
if clusterInfo.DirChildrenNumLimit < proto.MinDirChildrenNumLimit {
log.LogWarnf("updateNodeInfo: DirChildrenNumLimit probably not enabled on master, set to default value(%v)",
proto.DefaultDirChildrenNumLimit)
atomic.StoreUint32(&dirChildrenNumLimit, proto.DefaultDirChildrenNumLimit)
} else {
atomic.StoreUint32(&dirChildrenNumLimit, clusterInfo.DirChildrenNumLimit)
log.LogInfof("updateNodeInfo: DirChildrenNumLimit(%v)", clusterInfo.DirChildrenNumLimit)
}
// updateDirChildrenNumLimit(clusterInfo.DirChildrenNumLimit)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type Packet struct {
proto.Packet
}
// NewPacketToDeleteExtent returns a new packet to delete the extent.
func NewPacketToDeleteExtent(dp *DataPartition, ext *proto.ExtentKey) (p *Packet, invalid bool) {
p = new(Packet)
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpMarkDelete
p.ExtentType = proto.NormalExtentType
p.PartitionID = dp.PartitionID
if storage.IsTinyExtent(ext.ExtentId) {
p.ExtentType = proto.TinyExtentType
}
log.LogDebugf("NewPacketToDeleteExtent. ext %v", ext)
if ext.IsSplit() {
var (
newOff = ext.ExtentOffset
newSize = ext.Size
)
if int(ext.ExtentOffset)%util.PageSize != 0 {
log.LogDebugf("NewPacketToDeleteExtent. ext %v", ext)
newOff = ext.ExtentOffset + util.PageSize - ext.ExtentOffset%util.PageSize
if ext.Size <= uint32(newOff-ext.ExtentOffset) {
invalid = true
log.LogDebugf("NewPacketToDeleteExtent. ext %v invalid to punch hole newOff %v",
ext, newOff)
return
}
newSize = ext.Size - uint32(newOff-ext.ExtentOffset)
}
if newSize%util.PageSize != 0 {
newSize = newSize - newSize%util.PageSize
}
if newSize == 0 {
invalid = true
log.LogDebugf("NewPacketToDeleteExtent. ext %v invalid to punch hole", ext)
return
}
ext.Size = newSize
ext.ExtentOffset = newOff
log.LogDebugf("ext [%v] delete be set split flag", ext)
p.Opcode = proto.OpSplitMarkDelete
} else {
log.LogDebugf("ext [%v] delete normal ext", ext)
}
p.Data, _ = json.Marshal(ext)
p.Size = uint32(len(p.Data))
p.ExtentID = ext.ExtentId
p.ReqID = proto.GenerateRequestID()
p.RemainingFollowers = uint8(len(dp.Hosts) - 1)
if len(dp.Hosts) == 1 {
p.RemainingFollowers = 127
}
p.Arg = ([]byte)(dp.GetAllAddrs())
p.ArgLen = uint32(len(p.Arg))
return
}
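// Editorial note (not part of the original source): for a split extent the delete turns
// into a page-aligned punch hole — the offset is rounded up to the next util.PageSize
// boundary and the size is trimmed down to a PageSize multiple, and anything that no
// longer covers a full page is skipped via invalid=true. Assuming a 4 KiB page size
// purely for illustration:
//
//	// ExtentOffset=5000, Size=10000
//	// newOff  = 5000 + 4096 - 5000%4096 = 8192
//	// newSize = 10000 - (8192-5000) = 6808, trimmed to 4096
//	// the packet then punches [8192, 8192+4096) with OpSplitMarkDelete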
// NewPacketToBatchDeleteExtent returns a new packet to batch delete the extent.
func NewPacketToBatchDeleteExtent(dp *DataPartition, exts []*proto.ExtentKey) *Packet {
p := new(Packet)
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpBatchDeleteExtent
p.ExtentType = proto.NormalExtentType
p.PartitionID = uint64(dp.PartitionID)
p.Data, _ = json.Marshal(exts)
p.Size = uint32(len(p.Data))
p.ReqID = proto.GenerateRequestID()
p.RemainingFollowers = uint8(len(dp.Hosts) - 1)
if len(dp.Hosts) == 1 {
p.RemainingFollowers = 127
}
p.Arg = ([]byte)(dp.GetAllAddrs())
p.ArgLen = uint32(len(p.Arg))
return p
}
// NewPacketToDeleteExtent returns a new packet to delete the extent.
func NewPacketToFreeInodeOnRaftFollower(partitionID uint64, freeInodes []byte) *Packet {
p := new(Packet)
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpMetaFreeInodesOnRaftFollower
p.PartitionID = partitionID
p.ExtentType = proto.NormalExtentType
p.ReqID = proto.GenerateRequestID()
p.Data = make([]byte, len(freeInodes))
copy(p.Data, freeInodes)
p.Size = uint32(len(p.Data))
return p
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/json"
"fmt"
"math"
"math/rand"
"os"
"path"
"reflect"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/blobstore/api/access"
"github.com/cubefs/cubefs/cmd/common"
raftproto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/sdk/data/blobstore"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
// NOTE: if the operation is invoked by local machine
// the remote addr is "127.0.0.1"
const localAddrForAudit = "127.0.0.1"
var (
ErrIllegalHeartbeatAddress = errors.New("illegal heartbeat address")
ErrIllegalReplicateAddress = errors.New("illegal replicate address")
ErrSnapshotCrcMismatch = errors.New("snapshot crc not match")
)
// Errors
var (
ErrInodeIDOutOfRange = errors.New("inode ID out of range")
)
type sortedPeers []proto.Peer
func (sp sortedPeers) Len() int {
return len(sp)
}
func (sp sortedPeers) Less(i, j int) bool {
return sp[i].ID < sp[j].ID
}
func (sp sortedPeers) Swap(i, j int) {
sp[i], sp[j] = sp[j], sp[i]
}
// MetaMultiSnapshotInfo
type MetaMultiSnapshotInfo struct {
VerSeq uint64
Status int8
Ctime time.Time
}
// MetaPartitionConfig is used to create a meta partition.
type MetaPartitionConfig struct {
// Identity for raftStore group. RaftStore nodes in the same raftStore group must have the same groupID.
PartitionId uint64 `json:"partition_id"`
VolName string `json:"vol_name"`
Start uint64 `json:"start"` // Minimal Inode ID of this range. (Required during initialization)
End uint64 `json:"end"` // Maximal Inode ID of this range. (Required during initialization)
PartitionType int `json:"partition_type"`
Peers []proto.Peer `json:"peers"` // Peers information of the raftStore
Cursor uint64 `json:"-"` // Cursor ID of the inode that has been assigned
UniqId uint64 `json:"-"`
NodeId uint64 `json:"-"`
RootDir string `json:"-"`
VerSeq uint64 `json:"ver_seq"`
BeforeStart func() `json:"-"`
AfterStart func() `json:"-"`
BeforeStop func() `json:"-"`
AfterStop func() `json:"-"`
RaftStore raftstore.RaftStore `json:"-"`
ConnPool *util.ConnectPool `json:"-"`
Forbidden bool `json:"-"`
}
func (c *MetaPartitionConfig) checkMeta() (err error) {
if c.PartitionId <= 0 {
err = errors.NewErrorf("[checkMeta]: partition id at least 1, "+
"now partition id is: %d", c.PartitionId)
return
}
if c.Start < 0 {
err = errors.NewErrorf("[checkMeta]: start at least 0")
return
}
if c.End <= c.Start {
err = errors.NewErrorf("[checkMeta]: end=%v, "+
"start=%v; end <= start", c.End, c.Start)
return
}
if len(c.Peers) <= 0 {
err = errors.NewErrorf("[checkMeta]: must have peers, now peers is 0")
return
}
return
}
func (c *MetaPartitionConfig) sortPeers() {
sp := sortedPeers(c.Peers)
sort.Sort(sp)
}
// OpInode defines the interface for the inode operations.
type OpInode interface {
CreateInode(req *CreateInoReq, p *Packet, remoteAddr string) (err error)
UnlinkInode(req *UnlinkInoReq, p *Packet, remoteAddr string) (err error)
UnlinkInodeBatch(req *BatchUnlinkInoReq, p *Packet, remoteAddr string) (err error)
InodeGet(req *InodeGetReq, p *Packet) (err error)
InodeGetSplitEk(req *InodeGetSplitReq, p *Packet) (err error)
InodeGetBatch(req *InodeGetReqBatch, p *Packet) (err error)
CreateInodeLink(req *LinkInodeReq, p *Packet, remoteAddr string) (err error)
EvictInode(req *EvictInodeReq, p *Packet, remoteAddr string) (err error)
EvictInodeBatch(req *BatchEvictInodeReq, p *Packet, remoteAddr string) (err error)
SetAttr(req *SetattrRequest, reqData []byte, p *Packet) (err error)
GetInodeTree() *BTree
GetInodeTreeLen() int
DeleteInode(req *proto.DeleteInodeRequest, p *Packet, remoteAddr string) (err error)
DeleteInodeBatch(req *proto.DeleteInodeBatchRequest, p *Packet, remoteAddr string) (err error)
ClearInodeCache(req *proto.ClearInodeCacheRequest, p *Packet) (err error)
TxCreateInode(req *proto.TxCreateInodeRequest, p *Packet, remoteAddr string) (err error)
TxUnlinkInode(req *proto.TxUnlinkInodeRequest, p *Packet, remoteAddr string) (err error)
TxCreateInodeLink(req *proto.TxLinkInodeRequest, p *Packet, remoteAddr string) (err error)
QuotaCreateInode(req *proto.QuotaCreateInodeRequest, p *Packet, remoteAddr string) (err error)
}
type OpExtend interface {
SetXAttr(req *proto.SetXAttrRequest, p *Packet) (err error)
BatchSetXAttr(req *proto.BatchSetXAttrRequest, p *Packet) (err error)
GetXAttr(req *proto.GetXAttrRequest, p *Packet) (err error)
GetAllXAttr(req *proto.GetAllXAttrRequest, p *Packet) (err error)
BatchGetXAttr(req *proto.BatchGetXAttrRequest, p *Packet) (err error)
RemoveXAttr(req *proto.RemoveXAttrRequest, p *Packet) (err error)
ListXAttr(req *proto.ListXAttrRequest, p *Packet) (err error)
UpdateXAttr(req *proto.UpdateXAttrRequest, p *Packet) (err error)
}
// OpDentry defines the interface for the dentry operations.
type OpDentry interface {
CreateDentry(req *CreateDentryReq, p *Packet, remoteAddr string) (err error)
DeleteDentry(req *DeleteDentryReq, p *Packet, remoteAddr string) (err error)
DeleteDentryBatch(req *BatchDeleteDentryReq, p *Packet, remoteAddr string) (err error)
UpdateDentry(req *UpdateDentryReq, p *Packet, remoteAddr string) (err error)
ReadDir(req *ReadDirReq, p *Packet) (err error)
ReadDirLimit(req *ReadDirLimitReq, p *Packet) (err error)
ReadDirOnly(req *ReadDirOnlyReq, p *Packet) (err error)
Lookup(req *LookupReq, p *Packet) (err error)
GetDentryTree() *BTree
GetDentryTreeLen() int
TxCreateDentry(req *proto.TxCreateDentryRequest, p *Packet, remoteAddr string) (err error)
TxDeleteDentry(req *proto.TxDeleteDentryRequest, p *Packet, remoteAddr string) (err error)
TxUpdateDentry(req *proto.TxUpdateDentryRequest, p *Packet, remoteAddr string) (err error)
QuotaCreateDentry(req *proto.QuotaCreateDentryRequest, p *Packet, remoteAddr string) (err error)
}
type OpTransaction interface {
TxCreate(req *proto.TxCreateRequest, p *Packet) (err error)
TxCommitRM(req *proto.TxApplyRMRequest, p *Packet) error
TxRollbackRM(req *proto.TxApplyRMRequest, p *Packet) error
TxCommit(req *proto.TxApplyRequest, p *Packet, remoteAddr string) (err error)
TxRollback(req *proto.TxApplyRequest, p *Packet, remoteAddr string) (err error)
TxGetInfo(req *proto.TxGetInfoRequest, p *Packet) (err error)
TxGetCnt() (uint64, uint64, uint64)
TxGetTree() (*BTree, *BTree, *BTree)
}
// OpExtent defines the interface for the extent operations.
type OpExtent interface {
ExtentAppend(req *proto.AppendExtentKeyRequest, p *Packet) (err error)
ExtentAppendWithCheck(req *proto.AppendExtentKeyWithCheckRequest, p *Packet) (err error)
BatchObjExtentAppend(req *proto.AppendObjExtentKeysRequest, p *Packet) (err error)
ExtentsList(req *proto.GetExtentsRequest, p *Packet) (err error)
ObjExtentsList(req *proto.GetExtentsRequest, p *Packet) (err error)
ExtentsTruncate(req *ExtentsTruncateReq, p *Packet, remoteAddr string) (err error)
BatchExtentAppend(req *proto.AppendExtentKeysRequest, p *Packet) (err error)
// ExtentsDelete(req *proto.DelExtentKeyRequest, p *Packet) (err error)
}
type OpMultipart interface {
GetMultipart(req *proto.GetMultipartRequest, p *Packet) (err error)
CreateMultipart(req *proto.CreateMultipartRequest, p *Packet) (err error)
AppendMultipart(req *proto.AddMultipartPartRequest, p *Packet) (err error)
RemoveMultipart(req *proto.RemoveMultipartRequest, p *Packet) (err error)
ListMultipart(req *proto.ListMultipartRequest, p *Packet) (err error)
GetUidInfo() (info []*proto.UidReportSpaceInfo)
SetUidLimit(info []*proto.UidSpaceInfo)
SetTxInfo(info []*proto.TxInfo)
GetExpiredMultipart(req *proto.GetExpiredMultipartRequest, p *Packet) (err error)
}
// MultiVersion operation from master or client
type OpMultiVersion interface {
GetVerSeq() uint64
GetVerList() []*proto.VolVersionInfo
GetAllVerList() []*proto.VolVersionInfo
HandleVersionOp(op uint8, verSeq uint64, verList []*proto.VolVersionInfo, sync bool) (err error)
fsmVersionOp(reqData []byte) (err error)
GetAllVersionInfo(req *proto.MultiVersionOpRequest, p *Packet) (err error)
GetSpecVersionInfo(req *proto.MultiVersionOpRequest, p *Packet) (err error)
GetExtentByVer(ino *Inode, req *proto.GetExtentsRequest, rsp *proto.GetExtentsResponse)
checkVerList(info *proto.VolVersionInfoList, sync bool) (needUpdate bool, err error)
checkByMasterVerlist(mpVerList *proto.VolVersionInfoList, masterVerList *proto.VolVersionInfoList) (err error)
}
// OpMeta defines the interface for the metadata operations.
type OpMeta interface {
OpInode
OpDentry
OpExtent
OpPartition
OpExtend
OpMultipart
OpTransaction
OpQuota
OpMultiVersion
}
// OpPartition defines the interface for the partition operations.
type OpPartition interface {
GetVolName() (volName string)
IsLeader() (leaderAddr string, isLeader bool)
LeaderTerm() (leaderID, term uint64)
IsFollowerRead() bool
SetFollowerRead(bool)
GetCursor() uint64
GetUniqId() uint64
GetBaseConfig() MetaPartitionConfig
ResponseLoadMetaPartition(p *Packet) (err error)
PersistMetadata() (err error)
RenameStaleMetadata() (err error)
ChangeMember(changeType raftproto.ConfChangeType, peer raftproto.Peer, context []byte) (resp interface{}, err error)
Reset() (err error)
UpdatePartition(req *UpdatePartitionReq, resp *UpdatePartitionResp) (err error)
DeleteRaft() error
IsExsitPeer(peer proto.Peer) bool
TryToLeader(groupID uint64) error
CanRemoveRaftMember(peer proto.Peer) error
IsEquareCreateMetaPartitionRequst(request *proto.CreateMetaPartitionRequest) (err error)
GetUniqID(p *Packet, num uint32) (err error)
}
// MetaPartition defines the interface for the meta partition operations.
type MetaPartition interface {
Start(isCreate bool) error
Stop()
DataSize() uint64
GetFreeListLen() int
OpMeta
LoadSnapshot(path string) error
ForceSetMetaPartitionToLoadding()
ForceSetMetaPartitionToFininshLoad()
IsForbidden() bool
SetForbidden(status bool)
IsEnableAuditLog() bool
SetEnableAuditLog(status bool)
}
type UidManager struct {
accumDelta *sync.Map
accumBase *sync.Map
accumRebuildDelta *sync.Map // snapshot redoLog
accumRebuildBase *sync.Map // snapshot mirror
uidAcl *sync.Map
lastUpdateTime time.Time
enable bool
rbuilding bool
volName string
acLock sync.RWMutex
mpID uint64
}
func NewUidMgr(volName string, mpID uint64) (mgr *UidManager) {
mgr = &UidManager{
volName: volName,
mpID: mpID,
accumDelta: new(sync.Map),
accumBase: new(sync.Map),
accumRebuildDelta: new(sync.Map),
accumRebuildBase: new(sync.Map),
uidAcl: new(sync.Map),
}
var uid uint32
mgr.uidAcl.Store(uid, false)
log.LogDebugf("NewUidMgr init")
return
}
func (uMgr *UidManager) addUidSpace(uid uint32, inode uint64, eks []proto.ExtentKey) (status uint8) {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
status = proto.OpOk
if uMgr.getUidAcl(uid) {
log.LogWarnf("addUidSpace.volname [%v] mp[%v] uid %v be set full", uMgr.mpID, uMgr.volName, uid)
return proto.OpNoSpaceErr
}
if eks == nil {
return
}
var size int64
for _, ek := range eks {
size += int64(ek.Size)
}
if val, ok := uMgr.accumDelta.Load(uid); ok {
size += val.(int64)
}
uMgr.accumDelta.Store(uid, size)
if uMgr.rbuilding {
if val, ok := uMgr.accumRebuildDelta.Load(uid); ok {
size += val.(int64)
}
uMgr.accumRebuildDelta.Store(uid, size)
}
return
}
func (uMgr *UidManager) doMinusUidSpace(uid uint32, inode uint64, size uint64) {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
doWork := func(delta *sync.Map) {
var rsvSize int64
if val, ok := delta.Load(uid); ok {
delta.Store(uid, val.(int64)-int64(size))
} else {
rsvSize -= int64(size)
delta.Store(uid, rsvSize)
}
}
doWork(uMgr.accumDelta)
if uMgr.rbuilding {
doWork(uMgr.accumRebuildDelta)
}
}
func (uMgr *UidManager) minusUidSpace(uid uint32, inode uint64, eks []proto.ExtentKey) {
var size uint64
for _, ek := range eks {
size += uint64(ek.Size)
}
uMgr.doMinusUidSpace(uid, inode, size)
}
func (uMgr *UidManager) getUidAcl(uid uint32) (enable bool) {
if val, ok := uMgr.uidAcl.Load(uid); ok {
enable = val.(bool)
}
return
}
func (uMgr *UidManager) setUidAcl(info []*proto.UidSpaceInfo) {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
uMgr.uidAcl = new(sync.Map)
for _, uidInfo := range info {
if uidInfo.VolName != uMgr.volName {
continue
}
// log.LogDebugf("setUidAcl.volname [%v] uid %v be set enable %v", uMgr.volName, uidInfo.Uid, uidInfo.Limited)
uMgr.uidAcl.Store(uidInfo.Uid, uidInfo.Limited)
}
}
func (uMgr *UidManager) getAllUidSpace() (rsp []*proto.UidReportSpaceInfo) {
uMgr.acLock.RLock()
defer uMgr.acLock.RUnlock()
var ok bool
uMgr.accumDelta.Range(func(key, value interface{}) bool {
var size int64
size += value.(int64)
if baseInfo, ok := uMgr.accumBase.Load(key.(uint32)); ok {
size += baseInfo.(int64)
if size < 0 {
log.LogErrorf("getAllUidSpace. mp[%v] uid %v size small than 0 %v, old %v, new %v", uMgr.mpID, key.(uint32), size, value.(int64), baseInfo.(int64))
return false
}
}
uMgr.accumBase.Store(key.(uint32), size)
return true
})
uMgr.accumDelta = new(sync.Map)
uMgr.accumBase.Range(func(key, value interface{}) bool {
var size int64
if size, ok = value.(int64); !ok {
log.LogErrorf("getAllUidSpace. mp[%v] accumBase key %v size type %v", uMgr.mpID, reflect.TypeOf(key), reflect.TypeOf(value))
return false
}
rsp = append(rsp, &proto.UidReportSpaceInfo{
Uid: key.(uint32),
Size: uint64(size),
})
// log.LogDebugf("getAllUidSpace. mp[%v] accumBase uid %v size %v", uMgr.mpID, key.(uint32), size)
return true
})
return
}
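// Editorial note (not part of the original source): getAllUidSpace folds the per-uid
// deltas accumulated since the last report into accumBase and then resets accumDelta,
// so accumBase carries the running total per uid while accumDelta only holds changes
// made since the previous call; the returned report is therefore cumulative.
//
//	mgr := NewUidMgr("vol1", 10) // hypothetical volume name and mp id
//	mgr.addUidSpace(1000, 1, []proto.ExtentKey{{Size: 4096}})
//	infos := mgr.getAllUidSpace() // uid 1000 reports 4096 bytes
//	mgr.addUidSpace(1000, 2, []proto.ExtentKey{{Size: 4096}})
//	infos = mgr.getAllUidSpace() // uid 1000 now reports 8192 bytes
//	_ = infos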
func (uMgr *UidManager) accumRebuildStart() bool {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
log.LogDebugf("accumRebuildStart vol [%v] mp[%v] rbuilding [%v]", uMgr.volName, uMgr.mpID, uMgr.rbuilding)
if uMgr.rbuilding {
return false
}
uMgr.rbuilding = true
return true
}
func (uMgr *UidManager) accumRebuildFin(rebuild bool) {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
log.LogDebugf("accumRebuildFin rebuild volname [%v], mp:[%v],%v:%v, rebuild:[%v]", uMgr.volName, uMgr.mpID,
uMgr.accumRebuildBase, uMgr.accumRebuildDelta, rebuild)
uMgr.rbuilding = false
if !rebuild {
uMgr.accumRebuildBase = new(sync.Map)
uMgr.accumRebuildDelta = new(sync.Map)
return
}
uMgr.accumBase = uMgr.accumRebuildBase
uMgr.accumDelta = uMgr.accumRebuildDelta
uMgr.accumRebuildBase = new(sync.Map)
uMgr.accumRebuildDelta = new(sync.Map)
}
func (uMgr *UidManager) accumInoUidSize(ino *Inode, accum *sync.Map) {
size := ino.GetSpaceSize()
if val, ok := accum.Load(ino.Uid); ok {
size += uint64(val.(int64))
}
accum.Store(ino.Uid, int64(size))
}
type OpQuota interface {
setQuotaHbInfo(infos []*proto.QuotaHeartBeatInfo)
getQuotaReportInfos() (infos []*proto.QuotaReportInfo)
batchSetInodeQuota(req *proto.BatchSetMetaserverQuotaReuqest,
resp *proto.BatchSetMetaserverQuotaResponse) (err error)
batchDeleteInodeQuota(req *proto.BatchDeleteMetaserverQuotaReuqest,
resp *proto.BatchDeleteMetaserverQuotaResponse) (err error)
getInodeQuota(inode uint64, p *Packet) (err error)
}
// metaPartition manages the range of the inode IDs.
// When a new inode is requested, it allocates a new inode id for this inode if possible.
// States:
//
// +-----+             +-------+
// | New | → Restore → | Ready |
// +-----+             +-------+
type metaPartition struct {
config *MetaPartitionConfig
size uint64 // total size of all files in the partition
applyID uint64 // Inode/Dentry max applyID, this index will be updated after restoring from the dumped data.
storedApplyId uint64 // update after store snapshot to disk
dentryTree *BTree // btree for dentries
inodeTree *BTree // btree for inodes
extendTree *BTree // btree for inode extend (XAttr) management
multipartTree *BTree // collection for multipart management
txProcessor *TransactionProcessor // transaction processor
raftPartition raftstore.Partition
stopC chan bool
storeChan chan *storeMsg
state uint32
delInodeFp *os.File
freeList *freeList // free inode list
extDelCh chan []proto.ExtentKey
extReset chan struct{}
vol *Vol
manager *metadataManager
isLoadingMetaPartition bool
summaryLock sync.Mutex
ebsClient *blobstore.BlobStoreClient
volType int
isFollowerRead bool
uidManager *UidManager
xattrLock sync.Mutex
fileRange []int64
mqMgr *MetaQuotaManager
nonIdempotent sync.Mutex
uniqChecker *uniqChecker
verSeq uint64
multiVersionList *proto.VolVersionInfoList
versionLock sync.Mutex
verUpdateChan chan []byte
enableAuditLog bool
}
func (mp *metaPartition) IsForbidden() bool {
return mp.config.Forbidden
}
func (mp *metaPartition) SetForbidden(status bool) {
mp.config.Forbidden = status
}
func (mp *metaPartition) IsEnableAuditLog() bool {
return mp.enableAuditLog
}
func (mp *metaPartition) SetEnableAuditLog(status bool) {
mp.enableAuditLog = status
}
func (mp *metaPartition) acucumRebuildStart() bool {
return mp.uidManager.accumRebuildStart()
}
func (mp *metaPartition) acucumRebuildFin(rebuild bool) {
mp.uidManager.accumRebuildFin(rebuild)
}
func (mp *metaPartition) acucumUidSizeByStore(ino *Inode) {
mp.uidManager.accumInoUidSize(ino, mp.uidManager.accumRebuildBase)
}
func (mp *metaPartition) acucumUidSizeByLoad(ino *Inode) {
mp.uidManager.accumInoUidSize(ino, mp.uidManager.accumBase)
}
func (mp *metaPartition) GetVerList() []*proto.VolVersionInfo {
mp.multiVersionList.RWLock.RLock()
defer mp.multiVersionList.RWLock.RUnlock()
verList := make([]*proto.VolVersionInfo, len(mp.multiVersionList.VerList))
copy(verList, mp.multiVersionList.VerList)
return verList
}
// include TemporaryVerMap, otherwise temporary versions cannot be recycled after restart
func (mp *metaPartition) GetAllVerList() (verList []*proto.VolVersionInfo) {
mp.multiVersionList.RWLock.RLock()
defer mp.multiVersionList.RWLock.RUnlock()
verList = make([]*proto.VolVersionInfo, len(mp.multiVersionList.VerList))
copy(verList, mp.multiVersionList.VerList)
for _, verInfo := range mp.multiVersionList.TemporaryVerMap {
verList = append(verList, verInfo)
}
sort.SliceStable(verList, func(i, j int) bool {
if verList[i].Ver < verList[j].Ver {
return true
}
return false
})
return
}
func (mp *metaPartition) updateSize() {
timer := time.NewTicker(time.Minute * 2)
go func() {
for {
select {
case <-timer.C:
size := uint64(0)
mp.inodeTree.GetTree().Ascend(func(item BtreeItem) bool {
inode := item.(*Inode)
size += inode.Size
return true
})
mp.size = size
log.LogDebugf("[updateSize] update mp[%v] size(%d) success,inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, size, mp.inodeTree.Len(), mp.dentryTree.Len())
case <-mp.stopC:
log.LogDebugf("[updateSize] stop update mp[%v] size,inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, mp.inodeTree.Len(), mp.dentryTree.Len())
return
}
}
}()
}
func (mp *metaPartition) ForceSetMetaPartitionToLoadding() {
mp.isLoadingMetaPartition = true
}
func (mp *metaPartition) ForceSetMetaPartitionToFininshLoad() {
mp.isLoadingMetaPartition = false
}
func (mp *metaPartition) DataSize() uint64 {
return mp.size
}
func (mp *metaPartition) GetFreeListLen() int {
return mp.freeList.Len()
}
// Start starts a meta partition.
func (mp *metaPartition) Start(isCreate bool) (err error) {
if atomic.CompareAndSwapUint32(&mp.state, common.StateStandby, common.StateStart) {
defer func() {
var newState uint32
if err != nil {
newState = common.StateStandby
} else {
newState = common.StateRunning
}
atomic.StoreUint32(&mp.state, newState)
}()
if mp.config.BeforeStart != nil {
mp.config.BeforeStart()
}
if err = mp.onStart(isCreate); err != nil {
err = errors.NewErrorf("[Start]->%s", err.Error())
return
}
if mp.config.AfterStart != nil {
mp.config.AfterStart()
}
}
return
}
// Stop stops a meta partition.
func (mp *metaPartition) Stop() {
if atomic.CompareAndSwapUint32(&mp.state, common.StateRunning, common.StateShutdown) {
defer atomic.StoreUint32(&mp.state, common.StateStopped)
if mp.config.BeforeStop != nil {
mp.config.BeforeStop()
}
mp.onStop()
if mp.config.AfterStop != nil {
mp.config.AfterStop()
log.LogDebugf("[AfterStop]: partition id=%d execute ok.",
mp.config.PartitionId)
}
}
}
func (mp *metaPartition) versionInit(isCreate bool) (err error) {
if !isCreate {
return
}
var verList *proto.VolVersionInfoList
verList, err = masterClient.AdminAPI().GetVerList(mp.config.VolName)
if err != nil {
log.LogErrorf("action[onStart] GetVerList err[%v]", err)
return
}
for _, info := range verList.VerList {
if info.Status != proto.VersionNormal {
continue
}
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList, info)
}
log.LogDebugf("action[onStart] mp[%v] verList %v", mp.config.PartitionId, mp.multiVersionList.VerList)
vlen := len(mp.multiVersionList.VerList)
if vlen > 0 {
mp.verSeq = mp.multiVersionList.VerList[vlen-1].Ver
}
return
}
func (mp *metaPartition) onStart(isCreate bool) (err error) {
defer func() {
if err == nil {
return
}
mp.onStop()
}()
if err = mp.versionInit(isCreate); err != nil {
return
}
if err = mp.load(isCreate); err != nil {
err = errors.NewErrorf("[onStart] load partition id=%d: %s",
mp.config.PartitionId, err.Error())
return
}
mp.startScheduleTask()
if err = mp.startFreeList(); err != nil {
err = errors.NewErrorf("[onStart] start free list id=%d: %s",
mp.config.PartitionId, err.Error())
return
}
// set EBS Client
if clusterInfo, err = masterClient.AdminAPI().GetClusterInfo(); err != nil {
log.LogErrorf("action[onStart] GetClusterInfo err[%v]", err)
return
}
var volumeInfo *proto.SimpleVolView
if volumeInfo, err = masterClient.AdminAPI().GetVolumeSimpleInfo(mp.config.VolName); err != nil {
log.LogErrorf("action[onStart] GetVolumeSimpleInfo err[%v]", err)
return
}
mp.vol.volDeleteLockTime = volumeInfo.DeleteLockTime
go mp.runVersionOp()
mp.volType = volumeInfo.VolType
var ebsClient *blobstore.BlobStoreClient
if clusterInfo.EbsAddr != "" && proto.IsCold(mp.volType) {
ebsClient, err = blobstore.NewEbsClient(
access.Config{
ConnMode: access.NoLimitConnMode,
Consul: access.ConsulConfig{
Address: clusterInfo.EbsAddr,
},
MaxSizePutOnce: int64(volumeInfo.ObjBlockSize),
Logger: &access.Logger{Filename: path.Join(log.LogDir, "ebs.log")},
},
)
if err != nil {
log.LogErrorf("action[onStart] err[%v]", err)
return
}
if ebsClient == nil {
err = errors.NewErrorf("[onStart] ebsClient is nil")
return
}
mp.ebsClient = ebsClient
}
go mp.startCheckerEvict()
log.LogDebugf("[before raft] get mp[%v] applied(%d),inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, mp.applyID, mp.inodeTree.Len(), mp.dentryTree.Len())
if err = mp.startRaft(); err != nil {
err = errors.NewErrorf("[onStart] start raft id=%d: %s",
mp.config.PartitionId, err.Error())
return
}
log.LogDebugf("[after raft] get mp[%v] applied(%d),inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, mp.applyID, mp.inodeTree.Len(), mp.dentryTree.Len())
mp.updateSize()
if proto.IsHot(mp.volType) {
log.LogInfof("hot vol not need cacheTTL")
go mp.multiVersionTTLWork(time.Minute)
return
}
// do cache TTL die out process
if err = mp.cacheTTLWork(); err != nil {
err = errors.NewErrorf("[onStart] start CacheTTLWork id=%d: %s",
mp.config.PartitionId, err.Error())
return
}
return
}
func (mp *metaPartition) startScheduleTask() {
mp.startSchedule(mp.applyID)
mp.startFileStats()
}
func (mp *metaPartition) onStop() {
mp.stopRaft()
mp.stop()
if mp.delInodeFp != nil {
mp.delInodeFp.Sync()
mp.delInodeFp.Close()
}
}
func (mp *metaPartition) startRaft() (err error) {
var (
heartbeatPort int
replicaPort int
peers []raftstore.PeerAddress
)
if heartbeatPort, replicaPort, err = mp.getRaftPort(); err != nil {
return
}
for _, peer := range mp.config.Peers {
addr := strings.Split(peer.Addr, ":")[0]
rp := raftstore.PeerAddress{
Peer: raftproto.Peer{
ID: peer.ID,
},
Address: addr,
HeartbeatPort: heartbeatPort,
ReplicaPort: replicaPort,
}
peers = append(peers, rp)
}
log.LogInfof("start partition id=%d,applyID:%v raft peers: %s",
mp.config.PartitionId, mp.applyID, peers)
pc := &raftstore.PartitionConfig{
ID: mp.config.PartitionId,
Applied: mp.applyID,
Peers: peers,
SM: mp,
}
mp.raftPartition, err = mp.config.RaftStore.CreatePartition(pc)
if err == nil {
mp.ForceSetMetaPartitionToFininshLoad()
}
return
}
func (mp *metaPartition) stopRaft() {
if mp.raftPartition != nil {
// TODO Unhandled errors
// mp.raftPartition.Stop()
}
return
}
func (mp *metaPartition) getRaftPort() (heartbeat, replica int, err error) {
raftConfig := mp.config.RaftStore.RaftConfig()
heartbeatAddrSplits := strings.Split(raftConfig.HeartbeatAddr, ":")
replicaAddrSplits := strings.Split(raftConfig.ReplicateAddr, ":")
if len(heartbeatAddrSplits) != 2 {
err = ErrIllegalHeartbeatAddress
return
}
if len(replicaAddrSplits) != 2 {
err = ErrIllegalReplicateAddress
return
}
heartbeat, err = strconv.Atoi(heartbeatAddrSplits[1])
if err != nil {
return
}
replica, err = strconv.Atoi(replicaAddrSplits[1])
if err != nil {
return
}
return
}
// NewMetaPartition creates a new meta partition with the specified configuration.
func NewMetaPartition(conf *MetaPartitionConfig, manager *metadataManager) MetaPartition {
mp := &metaPartition{
config: conf,
dentryTree: NewBtree(),
inodeTree: NewBtree(),
extendTree: NewBtree(),
multipartTree: NewBtree(),
stopC: make(chan bool),
storeChan: make(chan *storeMsg, 100),
freeList: newFreeList(),
extDelCh: make(chan []proto.ExtentKey, defaultDelExtentsCnt),
extReset: make(chan struct{}),
vol: NewVol(),
manager: manager,
uniqChecker: newUniqChecker(),
verSeq: conf.VerSeq,
multiVersionList: &proto.VolVersionInfoList{
TemporaryVerMap: make(map[uint64]*proto.VolVersionInfo),
},
enableAuditLog: true,
}
mp.txProcessor = NewTransactionProcessor(mp)
return mp
}
func (mp *metaPartition) GetVolName() (volName string) {
return mp.config.VolName
}
func (mp *metaPartition) GetVerSeq() uint64 {
return atomic.LoadUint64(&mp.verSeq)
}
// SetFollowerRead enables or disables follower read on this meta partition.
func (mp *metaPartition) SetFollowerRead(fRead bool) {
if mp.raftPartition == nil {
return
}
mp.isFollowerRead = fRead
return
}
// IsFollowerRead returns whether this meta partition currently serves follower reads.
func (mp *metaPartition) IsFollowerRead() (ok bool) {
if mp.raftPartition == nil {
return false
}
if !mp.isFollowerRead {
return false
}
if mp.raftPartition.IsRestoring() {
return false
}
return true
}
// IsLeader returns the raft leader address and if the current meta partition is the leader.
func (mp *metaPartition) IsLeader() (leaderAddr string, ok bool) {
if mp.raftPartition == nil {
return
}
leaderID, _ := mp.raftPartition.LeaderTerm()
if leaderID == 0 {
return
}
ok = leaderID == mp.config.NodeId
for _, peer := range mp.config.Peers {
if leaderID == peer.ID {
leaderAddr = peer.Addr
return
}
}
return
}
func (mp *metaPartition) LeaderTerm() (leaderID, term uint64) {
if mp.raftPartition == nil {
return
}
return mp.raftPartition.LeaderTerm()
}
func (mp *metaPartition) GetPeers() (peers []string) {
peers = make([]string, 0)
for _, peer := range mp.config.Peers {
if mp.config.NodeId == peer.ID {
continue
}
peers = append(peers, peer.Addr)
}
return
}
// GetCursor returns the cursor stored in the config.
func (mp *metaPartition) GetCursor() uint64 {
return atomic.LoadUint64(&mp.config.Cursor)
}
// GetUniqId returns the uniqid stored in the config.
func (mp *metaPartition) GetUniqId() uint64 {
return atomic.LoadUint64(&mp.config.UniqId)
}
// PersistMetadata is the wrapper of persistMetadata.
func (mp *metaPartition) PersistMetadata() (err error) {
mp.config.sortPeers()
err = mp.persistMetadata()
return
}
// RenameStaleMetadata is the wrapper of renameStaleMetadata; it backs up the stale partition data to partition.old.
func (mp *metaPartition) RenameStaleMetadata() (err error) {
err = mp.renameStaleMetadata()
return
}
func (mp *metaPartition) parseCrcFromFile() ([]uint32, error) {
data, err := os.ReadFile(path.Join(path.Join(mp.config.RootDir, snapshotDir), SnapshotSign))
if err != nil {
return nil, err
}
raw := string(data)
crcStrs := strings.Split(raw, " ")
crcs := make([]uint32, 0, len(crcStrs))
for _, crcStr := range crcStrs {
crc, err := strconv.ParseUint(crcStr, 10, 32)
if err != nil {
return nil, err
}
crcs = append(crcs, uint32(crc))
}
return crcs, nil
}
const (
CRC_COUNT_BASIC int = 4
CRC_COUNT_TX_STUFF int = 7
CRC_COUNT_UINQ_STUFF int = 8
CRC_COUNT_MULTI_VER int = 9
)
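// Editorial note: these counts mirror how many CRC values the SnapshotSign file
// carries, which follows the write order of storeFuncs in store():
//
//	index 0: inode tree       index 4: tx info             index 8: multi-version list
//	index 1: dentry tree      index 5: tx rollback inode
//	index 2: extend tree      index 6: tx rollback dentry
//	index 3: multipart tree   index 7: uniq checker
//
// Snapshots written before a feature existed simply carry fewer values, which
// is why LoadSnapshot() accepts exactly 4, 7, 8 or 9 CRCs.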
func (mp *metaPartition) LoadSnapshot(snapshotPath string) (err error) {
crcs, err := mp.parseCrcFromFile()
if err != nil {
return err
}
loadFuncs := []func(rootDir string, crc uint32) error{
mp.loadInode,
mp.loadDentry,
nil, // loading quota info from the extend tree requires mp.loadInode() to have completed, so mp.loadExtend() is skipped here and run after the other loaders finish
mp.loadMultipart,
}
crc_count := len(crcs)
if crc_count != CRC_COUNT_BASIC && crc_count != CRC_COUNT_TX_STUFF && crc_count != CRC_COUNT_UINQ_STUFF && crc_count != CRC_COUNT_MULTI_VER {
log.LogErrorf("action[LoadSnapshot] crc array length %d not match", len(crcs))
return ErrSnapshotCrcMismatch
}
// handle compatibility in upgrade scenarios
needLoadTxStuff := false
needLoadUniqStuff := false
if crc_count >= CRC_COUNT_TX_STUFF {
needLoadTxStuff = true
loadFuncs = append(loadFuncs, mp.loadTxInfo)
loadFuncs = append(loadFuncs, mp.loadTxRbInode)
loadFuncs = append(loadFuncs, mp.loadTxRbDentry)
}
if crc_count >= CRC_COUNT_UINQ_STUFF {
needLoadUniqStuff = true
loadFuncs = append(loadFuncs, mp.loadUniqChecker)
}
if crc_count == CRC_COUNT_MULTI_VER {
if err = mp.loadMultiVer(snapshotPath, crcs[CRC_COUNT_MULTI_VER-1]); err != nil {
return
}
} else {
mp.storeMultiVersion(snapshotPath, &storeMsg{multiVerList: mp.multiVersionList.VerList})
}
errs := make([]error, len(loadFuncs))
var wg sync.WaitGroup
wg.Add(len(loadFuncs))
for idx, f := range loadFuncs {
loadFunc := f
if f == nil {
wg.Done()
continue
}
i := idx
go func() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("action[LoadSnapshot] recovered when load partition partition: %v, failed: %v",
mp.config.PartitionId, r)
errs[i] = errors.NewErrorf("%v", r)
}
wg.Done()
}()
errs[i] = loadFunc(snapshotPath, crcs[i])
}()
}
wg.Wait()
log.LogDebugf("[load meta finish] get mp[%v] inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, mp.inodeTree.Len(), mp.dentryTree.Len())
for _, err = range errs {
if err != nil {
return
}
}
if err = mp.loadExtend(snapshotPath, crcs[2]); err != nil {
return
}
if needLoadTxStuff {
if err = mp.loadTxID(snapshotPath); err != nil {
return
}
}
if needLoadUniqStuff {
if err = mp.loadUniqID(snapshotPath); err != nil {
return
}
}
if err = mp.loadApplyID(snapshotPath); err != nil {
return
}
return
}
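// Editorial sketch (not part of the original source): LoadSnapshot fans the
// per-tree loaders out into goroutines, collects one error per loader in a
// pre-sized slice, and recovers panics inside each goroutine so one corrupted
// snapshot file cannot crash the whole load. Reduced to its core, with loaders
// as a stand-in for the real load functions:
//
//	errs := make([]error, len(loaders))
//	var wg sync.WaitGroup
//	wg.Add(len(loaders))
//	for idx, f := range loaders {
//		i, loadFunc := idx, f // capture loop variables for the goroutine
//		go func() {
//			defer func() {
//				if r := recover(); r != nil {
//					errs[i] = fmt.Errorf("loader panic: %v", r)
//				}
//				wg.Done()
//			}()
//			errs[i] = loadFunc()
//		}()
//	}
//	wg.Wait() // afterwards, the first non-nil entry in errs is reported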
func (mp *metaPartition) load(isCreate bool) (err error) {
if err = mp.loadMetadata(); err != nil {
return
}
// 1. a newly created metaPartition has no snapshot to load
// 2. still store the snapshot files for the new mp, because
// mp.load() checks all the snapshot files when the metanode starts up
if isCreate {
if err = mp.storeSnapshotFiles(); err != nil {
err = errors.NewErrorf("[onStart] storeSnapshotFiles for partition id=%d: %s",
mp.config.PartitionId, err.Error())
}
return
}
snapshotPath := path.Join(mp.config.RootDir, snapshotDir)
if _, err = os.Stat(snapshotPath); err != nil {
log.LogErrorf("load snapshot failed, err: %s", err.Error())
return nil
}
return mp.LoadSnapshot(snapshotPath)
}
func (mp *metaPartition) store(sm *storeMsg) (err error) {
log.LogWarnf("metaPartition %d store apply %v", mp.config.PartitionId, sm.applyIndex)
tmpDir := path.Join(mp.config.RootDir, snapshotDirTmp)
if _, err = os.Stat(tmpDir); err == nil {
// TODO Unhandled errors
os.RemoveAll(tmpDir)
}
err = nil
if err = os.MkdirAll(tmpDir, 0o775); err != nil {
return
}
defer func() {
if err != nil {
// TODO Unhandled errors
os.RemoveAll(tmpDir)
}
}()
crcBuffer := bytes.NewBuffer(make([]byte, 0, 16))
storeFuncs := []func(dir string, sm *storeMsg) (uint32, error){
mp.storeInode,
mp.storeDentry,
mp.storeExtend,
mp.storeMultipart,
mp.storeTxInfo,
mp.storeTxRbInode,
mp.storeTxRbDentry,
mp.storeUniqChecker,
mp.storeMultiVersion,
}
for _, storeFunc := range storeFuncs {
var crc uint32
if crc, err = storeFunc(tmpDir, sm); err != nil {
return
}
if crcBuffer.Len() != 0 {
crcBuffer.WriteString(" ")
}
crcBuffer.WriteString(fmt.Sprintf("%d", crc))
}
log.LogWarnf("metaPartition %d store apply %v", mp.config.PartitionId, sm.applyIndex)
if err = mp.storeApplyID(tmpDir, sm); err != nil {
return
}
if err = mp.storeTxID(tmpDir, sm); err != nil {
return
}
if err = mp.storeUniqID(tmpDir, sm); err != nil {
return
}
// write crc to file
if err = os.WriteFile(path.Join(tmpDir, SnapshotSign), crcBuffer.Bytes(), 0o775); err != nil {
return
}
snapshotDir := path.Join(mp.config.RootDir, snapshotDir)
// check snapshot backup
backupDir := path.Join(mp.config.RootDir, snapshotBackup)
if _, err = os.Stat(backupDir); err == nil {
if err = os.RemoveAll(backupDir); err != nil {
return
}
}
err = nil
// rename snapshot
if _, err = os.Stat(snapshotDir); err == nil {
if err = os.Rename(snapshotDir, backupDir); err != nil {
return
}
}
err = nil
if err = os.Rename(tmpDir, snapshotDir); err != nil {
_ = os.Rename(backupDir, snapshotDir)
return
}
err = os.RemoveAll(backupDir)
if err != nil {
return
}
mp.storedApplyId = sm.applyIndex
return
}
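// Editorial note: store() keeps the on-disk snapshot swap as safe as plain
// directory renames allow. The sequence under RootDir (using the snapshotDirTmp,
// snapshotDir and snapshotBackup names defined elsewhere in this package) is:
//
//	1. write trees, sign file, applyID, txID and uniqID into the tmp dir
//	2. remove any stale backup dir
//	3. rename the current snapshot dir to the backup dir (keep the last good copy)
//	4. rename the tmp dir to the snapshot dir; on failure, restore the backup
//	5. remove the backup dir and record sm.applyIndex as storedApplyId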
// UpdatePeers updates the peers.
func (mp *metaPartition) UpdatePeers(peers []proto.Peer) {
mp.config.Peers = peers
}
// DeleteRaft deletes the raft partition.
func (mp *metaPartition) DeleteRaft() (err error) {
err = mp.raftPartition.Delete()
return
}
// nextInodeID returns a new inode ID and advances the cursor.
func (mp *metaPartition) nextInodeID() (inodeId uint64, err error) {
for {
cur := atomic.LoadUint64(&mp.config.Cursor)
end := mp.config.End
if cur >= end {
log.LogWarnf("nextInodeID: can't create inode again, cur %d, end %d", cur, end)
return 0, ErrInodeIDOutOfRange
}
newId := cur + 1
if atomic.CompareAndSwapUint64(&mp.config.Cursor, cur, newId) {
return newId, nil
}
}
}
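// Editorial sketch (not part of the original source): nextInodeID allocates IDs
// with a lock-free compare-and-swap loop over config.Cursor, bounded by
// config.End. The same idiom in isolation, where cursor, end and errOutOfRange
// stand in for the real fields and error:
//
//	for {
//		cur := atomic.LoadUint64(&cursor)
//		if cur >= end {
//			return 0, errOutOfRange
//		}
//		if atomic.CompareAndSwapUint64(&cursor, cur, cur+1) {
//			return cur + 1, nil // this goroutine won the slot
//		}
//		// another allocator advanced the cursor first; retry
//	}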
// ChangeMember changes the raft member with the specified one.
func (mp *metaPartition) ChangeMember(changeType raftproto.ConfChangeType, peer raftproto.Peer, context []byte) (resp interface{}, err error) {
resp, err = mp.raftPartition.ChangeMember(changeType, peer, context)
return
}
// GetBaseConfig returns the configuration stored in the meta partition. TODO remove? no usage?
func (mp *metaPartition) GetBaseConfig() MetaPartitionConfig {
return *mp.config
}
// UpdatePartition updates the meta partition. TODO remove? no usage?
func (mp *metaPartition) UpdatePartition(req *UpdatePartitionReq,
resp *UpdatePartitionResp) (err error) {
reqData, err := json.Marshal(req)
if err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
return
}
r, err := mp.submit(opFSMUpdatePartition, reqData)
if err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
return
}
if status := r.(uint8); status != proto.OpOk {
resp.Status = proto.TaskFailed
p := &Packet{}
p.ResultCode = status
err = errors.NewErrorf("[UpdatePartition]: %s", p.GetResultMsg())
resp.Result = p.GetResultMsg()
// return early so the failed status is not overwritten below
return
}
resp.Status = proto.TaskSucceeds
return
}
func (mp *metaPartition) DecommissionPartition(req []byte) (err error) {
_, err = mp.submit(opFSMDecommissionPartition, req)
return
}
func (mp *metaPartition) IsExsitPeer(peer proto.Peer) bool {
for _, hasExsitPeer := range mp.config.Peers {
if hasExsitPeer.Addr == peer.Addr && hasExsitPeer.ID == peer.ID {
return true
}
}
return false
}
func (mp *metaPartition) TryToLeader(groupID uint64) error {
return mp.raftPartition.TryToLeader(groupID)
}
// ResponseLoadMetaPartition loads the snapshot signature. TODO remove? no usage?
func (mp *metaPartition) ResponseLoadMetaPartition(p *Packet) (err error) {
resp := &proto.MetaPartitionLoadResponse{
PartitionID: mp.config.PartitionId,
DoCompare: true,
}
resp.MaxInode = mp.GetCursor()
resp.InodeCount = uint64(mp.GetInodeTreeLen())
resp.DentryCount = uint64(mp.GetDentryTreeLen())
resp.ApplyID = mp.getApplyID()
resp.CommittedID = mp.getCommittedID()
if err != nil {
err = errors.Trace(err,
"[ResponseLoadMetaPartition] check snapshot")
return
}
data, err := json.Marshal(resp)
if err != nil {
err = errors.Trace(err, "[ResponseLoadMetaPartition] marshal")
return
}
p.PacketOkWithBody(data)
return
}
// MarshalJSON is the wrapper of json.Marshal.
func (mp *metaPartition) MarshalJSON() ([]byte, error) {
return json.Marshal(mp.config)
}
// TODO remove? no usage?
// Reset resets the meta partition.
func (mp *metaPartition) Reset() (err error) {
mp.inodeTree.Reset()
mp.dentryTree.Reset()
mp.config.Cursor = 0
mp.config.UniqId = 0
mp.applyID = 0
mp.txProcessor.Reset()
// remove files
filenames := []string{applyIDFile, dentryFile, inodeFile, extendFile, multipartFile, verdataFile, txInfoFile, txRbInodeFile, txRbDentryFile, TxIDFile}
for _, filename := range filenames {
filepath := path.Join(mp.config.RootDir, filename)
if err = os.Remove(filepath); err != nil {
return
}
}
return
}
func (mp *metaPartition) canRemoveSelf() (canRemove bool, err error) {
var partition *proto.MetaPartitionInfo
if partition, err = masterClient.ClientAPI().GetMetaPartition(mp.config.PartitionId); err != nil {
log.LogErrorf("action[canRemoveSelf] err[%v]", err)
return
}
canRemove = false
var existInPeers bool
for _, peer := range partition.Peers {
if mp.config.NodeId == peer.ID {
existInPeers = true
}
}
if !existInPeers {
canRemove = true
return
}
if mp.config.NodeId == partition.OfflinePeerID {
canRemove = true
return
}
return
}
// multiVersionTTLWork periodically scans the temporary version list and removes
// versions that are due for deletion.
func (mp *metaPartition) multiVersionTTLWork(dur time.Duration) {
// first sleep a random time in [0, 60s) so that all mps do not start
// scanning at the same time.
rand.Seed(time.Now().Unix())
time.Sleep(time.Duration(rand.Intn(60)) * time.Second)
log.LogDebugf("[multiVersionTTLWork] start, mp[%v]", mp.config.PartitionId)
ttl := time.NewTicker(dur)
snapQueue := make(chan interface{}, 5)
for {
select {
case <-ttl.C:
log.LogDebugf("[multiVersionTTLWork] begin cache ttl, mp[%v]", mp.config.PartitionId)
mp.multiVersionList.RWLock.RLock()
volVersionInfoList := &proto.VolVersionInfoList{
TemporaryVerMap: make(map[uint64]*proto.VolVersionInfo),
}
// append (rather than copy into an empty slice) so VerList is actually cloned
volVersionInfoList.VerList = append(volVersionInfoList.VerList, mp.multiVersionList.VerList...)
for key, value := range mp.multiVersionList.TemporaryVerMap {
copiedValue := *value
volVersionInfoList.TemporaryVerMap[key] = &copiedValue
}
mp.multiVersionList.RWLock.RUnlock()
for _, version := range volVersionInfoList.TemporaryVerMap {
if version.Status == proto.VersionDeleting {
continue
}
snapQueue <- nil
version.Status = proto.VersionDeleting
go func(verSeq uint64) {
mp.delPartitionVersion(verSeq)
mp.multiVersionList.RWLock.Lock()
delete(mp.multiVersionList.TemporaryVerMap, verSeq)
mp.multiVersionList.RWLock.Unlock()
<-snapQueue
}(version.Ver)
}
case <-mp.stopC:
log.LogWarnf("[multiVersionTTLWork] stoped, mp[%v]", mp.config.PartitionId)
return
}
}
return
}
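// Editorial sketch (not part of the original source): the snapQueue channel in
// multiVersionTTLWork acts as a counting semaphore that caps concurrent version
// deletions at 5. The same pattern in generic form, where versionSeqs and
// process are placeholders:
//
//	sem := make(chan struct{}, 5) // capacity = max concurrent workers
//	for _, seq := range versionSeqs {
//		sem <- struct{}{} // acquire a slot; blocks while 5 deletions are running
//		go func(verSeq uint64) {
//			defer func() { <-sem }() // release the slot when done
//			process(verSeq)
//		}(seq)
//	}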
func (mp *metaPartition) delPartitionVersion(verSeq uint64) {
var wg sync.WaitGroup
wg.Add(3)
reqVerSeq := verSeq
if reqVerSeq == 0 {
reqVerSeq = math.MaxUint64
}
log.LogInfof("action[delPartitionVersion] mp[%v] verseq [%v]:%v", mp.config.PartitionId, verSeq, reqVerSeq)
go mp.delPartitionInodesVersion(reqVerSeq, &wg)
go mp.delPartitionExtendsVersion(reqVerSeq, &wg)
go mp.delPartitionDentriesVersion(reqVerSeq, &wg)
wg.Wait()
}
func (mp *metaPartition) delPartitionDentriesVersion(verSeq uint64, wg *sync.WaitGroup) {
defer wg.Done()
// begin
count := 0
needSleep := false
mp.dentryTree.GetTree().Ascend(func(i BtreeItem) bool {
if _, ok := mp.IsLeader(); !ok {
return false
}
den := i.(*Dentry)
p := &Packet{}
req := &proto.DeleteDentryRequest{
VolName: mp.config.VolName,
ParentID: den.ParentId,
PartitionID: mp.config.PartitionId,
Name: den.Name,
Verseq: verSeq,
}
mp.DeleteDentry(req, p, localAddrForAudit)
// if the result is OpAgain, the extDelCh may be full, so sleep 1s.
if p.ResultCode == proto.OpAgain {
needSleep = true
}
// sleep 1s after every 1000 dentries, or when asked to back off
count++
if count > 1000 || needSleep {
count %= 1000
needSleep = false
time.Sleep(time.Second)
}
return true
})
}
func (mp *metaPartition) delPartitionExtendsVersion(verSeq uint64, wg *sync.WaitGroup) {
defer wg.Done()
// begin
count := 0
needSleep := false
mp.extendTree.GetTree().Ascend(func(treeItem BtreeItem) bool {
if _, ok := mp.IsLeader(); !ok {
return false
}
e := treeItem.(*Extend)
p := &Packet{}
req := &proto.RemoveXAttrRequest{
VolName: mp.config.VolName,
PartitionId: mp.config.PartitionId,
Inode: e.inode,
VerSeq: verSeq,
}
mp.RemoveXAttr(req, p)
// if the result is OpAgain, the extDelCh may be full, so sleep 1s.
if p.ResultCode == proto.OpAgain {
needSleep = true
}
// sleep 1s after every 1000 extends, or when asked to back off
count++
if count > 1000 || needSleep {
count %= 1000
needSleep = false
time.Sleep(time.Second)
}
return true
})
}
func (mp *metaPartition) delPartitionInodesVersion(verSeq uint64, wg *sync.WaitGroup) {
defer wg.Done()
// begin
count := 0
needSleep := false
mp.inodeTree.GetTree().Ascend(func(i BtreeItem) bool {
if _, ok := mp.IsLeader(); !ok {
return false
}
inode := i.(*Inode)
// skip directories
if proto.IsDir(inode.Type) {
return true
}
inode.RLock()
// skip inodes that do not need this version deleted
if ok, _ := inode.ShouldDelVer(verSeq, mp.verSeq); !ok {
inode.RUnlock()
return true
}
p := &Packet{}
req := &proto.UnlinkInodeRequest{
Inode: inode.Inode,
VerSeq: verSeq,
}
inode.RUnlock()
mp.UnlinkInode(req, p, localAddrForAudit)
// if the result is OpAgain, the extDelCh may be full, so sleep 1s.
if p.ResultCode == proto.OpAgain {
needSleep = true
}
// sleep 1s after every 1000 inodes, or when asked to back off
count++
if count > 1000 || needSleep {
count %= 1000
needSleep = false
time.Sleep(time.Second)
}
return true
})
return
}
// cacheTTLWork only happens for cold (data lake) volumes.
func (mp *metaPartition) cacheTTLWork() (err error) {
// check volume type, only Cold volume will do the cache ttl.
volView, mcErr := masterClient.ClientAPI().GetVolumeWithoutAuthKey(mp.config.VolName)
if mcErr != nil {
err = fmt.Errorf("cacheTTLWork: can't get volume info: partitoinID(%v) volume(%v)",
mp.config.PartitionId, mp.config.VolName)
return
}
if volView.VolType != proto.VolumeTypeCold {
return
}
if mp.verSeq > 0 {
log.LogWarnf("[doCacheTTL] volume [%v] enable snapshot.exit cache ttl, mp[%v]", mp.GetVolName(), mp.config.PartitionId)
return
}
// do cache ttl work
go mp.doCacheTTL(volView.CacheTTL)
return
}
func (mp *metaPartition) doCacheTTL(cacheTTL int) (err error) {
// first sleep a random time in [0, 1200s (20m)) so that all mps do not
// start the scan work at the same time.
rand.Seed(time.Now().Unix())
time.Sleep(time.Duration(rand.Intn(1200)) * time.Second)
ttl := time.NewTicker(time.Duration(util.OneDaySec()) * time.Second)
for {
select {
case <-ttl.C:
if mp.verSeq > 0 {
log.LogWarnf("[doCacheTTL] volume [%v] enable snapshot.exit cache ttl, mp[%v] cacheTTL[%v]",
mp.GetVolName(), mp.config.PartitionId, cacheTTL)
return
}
log.LogDebugf("[doCacheTTL] begin cache ttl, mp[%v] cacheTTL[%v]", mp.config.PartitionId, cacheTTL)
// only leader can do TTL work
if _, ok := mp.IsLeader(); !ok {
log.LogDebugf("[doCacheTTL] partitionId=%d is not leader, skip", mp.config.PartitionId)
continue
}
// get the last cacheTTL
volView, mcErr := masterClient.ClientAPI().GetVolumeWithoutAuthKey(mp.config.VolName)
if mcErr != nil {
err = fmt.Errorf("[doCacheTTL]: can't get volume info: partitoinID(%v) volume(%v)",
mp.config.PartitionId, mp.config.VolName)
return
}
cacheTTL = volView.CacheTTL
mp.InodeTTLScan(cacheTTL)
case <-mp.stopC:
log.LogWarnf("[doCacheTTL] stoped, mp[%v]", mp.config.PartitionId)
return
}
}
}
func (mp *metaPartition) InodeTTLScan(cacheTTL int) {
curTime := timeutil.GetCurrentTimeUnix()
// begin
count := 0
needSleep := false
mp.inodeTree.GetTree().Ascend(func(i BtreeItem) bool {
inode := i.(*Inode)
// skip directories
if proto.IsDir(inode.Type) {
return true
}
inode.RLock()
// skip inodes with no extents or already marked deleted
if len(inode.Extents.eks) == 0 || inode.ShouldDelete() {
inode.RUnlock()
return true
}
if (curTime - inode.AccessTime) > int64(cacheTTL)*util.OneDaySec() {
log.LogDebugf("[InodeTTLScan] mp[%v] do inode ttl delete[%v]", mp.config.PartitionId, inode.Inode)
count++
// make request
p := &Packet{}
req := &proto.EmptyExtentKeyRequest{
Inode: inode.Inode,
}
ino := NewInode(req.Inode, 0)
curTime = timeutil.GetCurrentTimeUnix()
if inode.ModifyTime < curTime {
ino.ModifyTime = curTime
}
mp.ExtentsOp(p, ino, opFSMExtentsEmpty)
// if the result is OpAgain, the extDelCh may be full, so sleep 1s.
if p.ResultCode == proto.OpAgain {
needSleep = true
}
}
inode.RUnlock()
// every 1000 inode sleep 1s
if count > 1000 || needSleep {
count %= 1000
needSleep = false
time.Sleep(time.Second)
}
return true
})
}
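// Editorial note: InodeTTLScan treats cacheTTL as a number of days and expires
// cached extents based on the inode's last access time, i.e. an inode is
// emptied once
//
//	now - inode.AccessTime > int64(cacheTTL) * util.OneDaySec()
//
// with both sides in seconds. With cacheTTL = 30, for example, a file that has
// not been accessed for a bit over 30 days is emptied on the next daily scan.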
func (mp *metaPartition) initTxInfo(txInfo *proto.TransactionInfo) error {
txInfo.TxID = mp.txProcessor.txManager.nextTxID()
txInfo.CreateTime = time.Now().Unix()
txInfo.State = proto.TxStatePreCommit
if mp.txProcessor.txManager.opLimiter.Allow() {
return nil
}
return fmt.Errorf("tx create is limited")
}
func (mp *metaPartition) storeSnapshotFiles() (err error) {
msg := &storeMsg{
applyIndex: mp.applyID,
txId: mp.txProcessor.txManager.txIdAlloc.getTransactionID(),
inodeTree: NewBtree(),
dentryTree: NewBtree(),
extendTree: NewBtree(),
multipartTree: NewBtree(),
txTree: NewBtree(),
txRbInodeTree: NewBtree(),
txRbDentryTree: NewBtree(),
uniqId: mp.GetUniqId(),
uniqChecker: newUniqChecker(),
multiVerList: mp.multiVersionList.VerList,
}
return mp.store(msg)
}
func (mp *metaPartition) startCheckerEvict() {
timer := time.NewTimer(opCheckerInterval)
for {
select {
case <-timer.C:
if _, ok := mp.IsLeader(); ok {
left, evict, err := mp.uniqCheckerEvict()
if evict != 0 {
log.LogInfof("[uniqChecker] after doEvict partition-%d, left:%d, evict:%d, err:%v", mp.config.PartitionId, left, evict, err)
} else {
log.LogDebugf("[uniqChecker] after doEvict partition-%d, left:%d, evict:%d, err:%v", mp.config.PartitionId, left, evict, err)
}
}
timer.Reset(opCheckerInterval)
case <-mp.stopC:
return
}
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"container/list"
"encoding/binary"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"strings"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/synclist"
)
const (
prefixDelExtent = "EXTENT_DEL"
prefixDelExtentV2 = "EXTENT_DEL_V2"
prefixMultiVer = verdataFile
maxDeleteExtentSize = 10 * MB
)
var extentsFileHeader = []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08}
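// Editorial note: every EXTENT_DEL_* file starts with this 8-byte header. The
// header doubles as the delete cursor: it is read back as a big-endian uint64
// giving the offset up to which extent keys have already been processed, and
// its initial value (8) points just past the header itself. Marshaled extent
// keys are appended after it. A minimal cursor decode, assuming the same layout:
//
//	buf := make([]byte, 8)
//	if _, err := fp.ReadAt(buf, 0); err != nil {
//		return err
//	}
//	cursor := binary.BigEndian.Uint64(buf) // next unprocessed offset in the file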
// startToDeleteExtents starts the workers that append and consume the meta partition's delete-extents files.
func (mp *metaPartition) startToDeleteExtents() {
fileList := synclist.New()
go mp.appendDelExtentsToFile(fileList)
go mp.deleteExtentsFromList(fileList)
}
// create extent delete file
func (mp *metaPartition) createExtentDeleteFile(prefix string, idx int64, fileList *synclist.SyncList) (fp *os.File, fileName string, fileSize int64, err error) {
fileName = fmt.Sprintf("%s_%d", prefix, idx)
fp, err = os.OpenFile(path.Join(mp.config.RootDir, fileName),
os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
if err != nil {
log.LogErrorf("[metaPartition] createExtentDeletFile openFile %v %v error %v", mp.config.RootDir, fileName, err)
return
}
if _, err = fp.Write(extentsFileHeader); err != nil {
log.LogErrorf("[metaPartition] createExtentDeletFile Write %v %v error %v", mp.config.RootDir, fileName, err)
}
fileSize = int64(len(extentsFileHeader))
fileList.PushBack(fileName)
return
}
// append delete extents from extDelCh to EXTENT_DEL_N files
func (mp *metaPartition) appendDelExtentsToFile(fileList *synclist.SyncList) {
defer func() {
if r := recover(); r != nil {
log.LogErrorf(fmt.Sprintf("[metaPartition] appendDelExtentsToFile pid(%v) panic (%v)", mp.config.PartitionId, r))
}
}()
var (
fileName string
fileSize int64
idx int64
fp *os.File
err error
)
LOOP:
// scan existing EXTENT_DEL_* files to fill fileList
finfos, err := ioutil.ReadDir(mp.config.RootDir)
if err != nil {
panic(err)
}
finfos = sortDelExtFileInfo(finfos)
for _, info := range finfos {
fileList.PushBack(info.Name())
fileSize = info.Size()
}
// check
lastItem := fileList.Back()
if lastItem != nil {
fileName = lastItem.Value.(string)
}
if lastItem == nil || !strings.HasPrefix(fileName, prefixDelExtentV2) {
// if no EXTENT_DEL_* file exists, or the last one is not in the V2 format, create a new one
log.LogDebugf("action[appendDelExtentsToFile] verseq [%v]", mp.verSeq)
fp, fileName, fileSize, err = mp.createExtentDeleteFile(prefixDelExtentV2, idx, fileList)
log.LogDebugf("action[appendDelExtentsToFile] verseq [%v] fileName %v", mp.verSeq, fileName)
if err != nil {
panic(err)
}
} else {
// exist, open last file
fp, err = os.OpenFile(path.Join(mp.config.RootDir, fileName),
os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
if err != nil {
panic(err)
}
// continue from last item
idx = getDelExtFileIdx(fileName)
}
log.LogDebugf("action[appendDelExtentsToFile] verseq [%v] fileName %v", mp.verSeq, fileName)
// TODO Unhandled errors
defer fp.Close()
buf := make([]byte, 0)
for {
select {
case <-mp.stopC:
return
case <-mp.extReset:
// TODO Unhandled errors
fp.Close()
// reset fileList
fileList.Init()
goto LOOP
case eks := <-mp.extDelCh:
var data []byte
buf = buf[:0]
if len(eks) == 0 {
goto LOOP
}
log.LogDebugf("[appendDelExtentsToFile] mp(%v) del eks [%v]", mp.config.PartitionId, eks)
for _, ek := range eks {
data, err = ek.MarshalBinaryWithCheckSum(true)
if err != nil {
log.LogWarnf("[appendDelExtentsToFile] partitionId=%d,"+
" extentKey marshal: %s", mp.config.PartitionId, err.Error())
break
}
buf = append(buf, data...)
}
if err != nil {
err = mp.sendExtentsToChan(eks)
if err != nil {
log.LogErrorf("[appendDelExtentsToFile] mp[%v] sendExtentsToChan fail, err(%s)", mp.config.PartitionId, err.Error())
}
continue
}
if fileSize >= maxDeleteExtentSize {
// TODO Unhandled errors
// close old File
fp.Close()
idx += 1
fp, fileName, fileSize, err = mp.createExtentDeleteFile(prefixDelExtentV2, idx, fileList)
if err != nil {
panic(err)
}
log.LogDebugf("appendDelExtentsToFile. volname [%v] mp[%v] createExtentDeleteFile %v",
mp.GetVolName(), mp.config.PartitionId, fileName)
}
// write delete extents into file
if _, err = fp.Write(buf); err != nil {
panic(err)
}
fileSize += int64(len(buf))
log.LogDebugf("action[appendDelExtentsToFile] filesize now %v", fileSize)
}
}
}
func (mp *metaPartition) batchDeleteExtentsByDp(dpId uint64, extents []*proto.ExtentKey) (err error) {
dp := mp.vol.GetPartition(dpId)
if dp == nil {
log.LogErrorf("[batchDeleteExtentsByDp] mp(%v) dp(%v) not found", mp.config.PartitionId, dpId)
err = fmt.Errorf("dp %v is not found", dpId)
return
}
if dp.IsDiscard {
log.LogDebugf("[batchDeleteExtentsByDp] mp(%v) dp(%v) is discard", mp.config.PartitionId, dpId)
return
}
log.LogDebugf("[batchDeleteExtentsByDp] mp(%v) delete eks from dp(%v)", mp.config.PartitionId, dpId)
err = mp.doBatchDeleteExtentsByPartition(dpId, extents)
return
}
// deleteExtentsFromList reads extents recorded in the EXTENT_DEL_* files and deletes them from their data partitions.
func (mp *metaPartition) deleteExtentsFromList(fileList *synclist.SyncList) {
defer func() {
if r := recover(); r != nil {
log.LogErrorf(fmt.Sprintf("deleteExtentsFromList(%v) deleteExtentsFromList panic (%v)", mp.config.PartitionId, r))
}
}()
var (
element *list.Element
fileName string
file string
fileInfo os.FileInfo
err error
)
for {
// DeleteWorkerSleepMs()
time.Sleep(1 * time.Minute)
select {
case <-mp.stopC:
return
default:
}
element = fileList.Front()
if element == nil {
continue
}
fileName = element.Value.(string)
file = path.Join(mp.config.RootDir, fileName)
if fileInfo, err = os.Stat(file); err != nil {
log.LogDebugf("[deleteExtentsFromList] mp(%v) skip file(%v)", mp.config.PartitionId, fileName)
fileList.Remove(element)
continue
}
log.LogDebugf("[deleteExtentsFromList] mp(%v) reading file(%v)", mp.config.PartitionId, fileName)
// if not leader, ignore delete
if _, ok := mp.IsLeader(); !ok {
log.LogDebugf("[deleteExtentsFromList] partitionId=%d, "+
"not raft leader,please ignore", mp.config.PartitionId)
continue
}
// leader do delete extent for EXTENT_DEL_* file
// read delete extents from file
buf := make([]byte, 8)
fp, err := os.OpenFile(file, os.O_RDWR, 0o644)
if err != nil {
if !os.IsNotExist(err) {
log.LogErrorf("[deleteExtentsFromList] volname [%v] mp[%v] openFile %v error: %v", mp.GetVolName(), mp.config.PartitionId, file, err)
} else {
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete extents file(%v) deleted", mp.config.PartitionId, fileName)
}
fileList.Remove(element)
continue
}
// read the delete-extents cursor from the 8-byte file header
if _, err = fp.ReadAt(buf, 0); err != nil {
log.LogWarnf("[deleteExtentsFromList] partitionId=%d, "+
"failed to read the 8-byte cursor, retry later", mp.config.PartitionId)
// TODO Unhandled errors
fp.Close()
continue
}
extentV2 := false
extentKeyLen := uint64(proto.ExtentLength)
if strings.HasPrefix(fileName, prefixDelExtentV2) {
extentV2 = true
extentKeyLen = uint64(proto.ExtentV2Length)
}
cursor := binary.BigEndian.Uint64(buf)
stat, err := fp.Stat()
if err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v) stat file(%v) err(%v)", mp.config.PartitionId, fileName, err)
continue
}
log.LogDebugf("[deleteExtentsFromList] volname [%v] mp[%v] o openFile %v file len %v cursor %v", mp.GetVolName(), mp.config.PartitionId, file,
stat.Size(), cursor)
log.LogDebugf("action[deleteExtentsFromList] get cursor %v", cursor)
if fileInfo.Size() == int64(cursor) {
log.LogDebugf("[deleteExtentsFromList] mp(%v) reach the end of file(%v), sleep", mp.config.PartitionId, fileName)
fp.Close()
continue
} else if fileInfo.Size() > int64(cursor) && fileInfo.Size() < int64(cursor)+int64(extentKeyLen) {
log.LogErrorf("[deleteExtentsFromList] mp(%d), file(%v) corrupted!", mp.config.PartitionId, fileName)
fileList.Remove(element)
fp.Close()
continue
}
var deleteCnt uint64
errExts := make([]proto.ExtentKey, 0)
needDeleteExtents := make(map[uint64][]*proto.ExtentKey)
buf = make([]byte, util.MB)
err = func() (err error) {
// read extents from cursor
defer fp.Close()
// NOTE: read 1 MB at once
rLen, err := fp.ReadAt(buf, int64(cursor))
log.LogDebugf("[deleteExtentsFromList] mp(%v) read len(%v) cursor(%v), err(%v)", mp.config.PartitionId, rLen, cursor, err)
if err != nil {
if err == io.EOF {
err = nil
if rLen == 0 {
log.LogDebugf("[deleteExtentsFromList] mp(%v) file list cnt(%v)", mp.config.PartitionId, fileList.Len())
if fileList.Len() <= 1 {
log.LogDebugf("[deleteExtentsFromList] mp(%v) skip delete file(%v), free list count(%v)", mp.config.PartitionId, fileName, fileList.Len())
return
}
status := mp.raftPartition.Status()
_, isLeader := mp.IsLeader()
if isLeader && !status.RestoringSnapshot {
// delete old delete extents file for metapartition
if _, err = mp.submit(opFSMInternalDelExtentFile, []byte(fileName)); err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v), delete old file(%v), err(%v)", mp.config.PartitionId, fileName, err)
return
}
log.LogDebugf("[deleteExtentsFromList] mp(%v), delete old file(%v)", mp.config.PartitionId, fileName)
return
}
log.LogDebugf("[deleteExtentsFromList] partitionId=%d,delete"+
" old file status: %s", mp.config.PartitionId, status.State)
}
} else {
log.LogErrorf("[deleteExtentsFromList] mp(%v) failed to read file(%v), err(%v)", mp.config.PartitionId, fileName, err)
return
}
}
cursor += uint64(rLen)
buff := bytes.NewBuffer(buf[:rLen])
batchCount := DeleteBatchCount() * 5
for buff.Len() != 0 && deleteCnt < batchCount {
lastUnread := buff.Len()
// NOTE: adjust the cursor back for the partially read key
if uint64(buff.Len()) < extentKeyLen {
cursor -= uint64(lastUnread)
break
}
if extentV2 && uint64(buff.Len()) < uint64(proto.ExtentV3Length) {
if r := bytes.Compare(buff.Bytes()[:4], proto.ExtentKeyHeaderV3); r == 0 {
cursor -= uint64(lastUnread)
break
}
}
// NOTE: read ek
ek := proto.ExtentKey{}
if extentV2 {
if err = ek.UnmarshalBinaryWithCheckSum(buff); err != nil {
if err == proto.InvalidKeyHeader || err == proto.InvalidKeyCheckSum {
log.LogErrorf("[deleteExtentsFromList] invalid extent key header %v, %v, %v", fileName, mp.config.PartitionId, err)
return
}
log.LogErrorf("[deleteExtentsFromList] mp: %v Unmarshal extentkey from %v unresolved error: %v", mp.config.PartitionId, fileName, err)
return
}
} else {
// for delete records there is no need to parse the version
if err = ek.UnmarshalBinary(buff, false); err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v) failed to unmarshal extent", mp.config.PartitionId)
return
}
}
// NOTE: add to current batch
dpId := ek.PartitionId
eks := needDeleteExtents[dpId]
if eks == nil {
eks = make([]*proto.ExtentKey, 0)
}
eks = append(eks, &ek)
needDeleteExtents[dpId] = eks
// NOTE: limit batch count
deleteCnt++
log.LogDebugf("[deleteExtentsFromList] mp(%v) append extent(%v) to batch, count limit(%v), cnt(%v)", mp.config.PartitionId, ek, batchCount, deleteCnt)
}
log.LogDebugf("[deleteExtentsFromList] mp(%v) reach the end of buffer", mp.config.PartitionId)
return
}()
if err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v) failed to read delete file(%v), err(%v)", mp.config.PartitionId, fileName, err)
continue
}
if deleteCnt == 0 {
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete cnt is 0, sleep", mp.config.PartitionId)
continue
}
successCnt := 0
for dpId, eks := range needDeleteExtents {
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete dp(%v) eks count(%v)", mp.config.PartitionId, dpId, len(eks))
err = mp.batchDeleteExtentsByDp(dpId, eks)
if err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v) failed to delete dp(%v) extents", mp.config.PartitionId, dpId)
err = nil
for _, ek := range eks {
errExts = append(errExts, *ek)
}
} else {
successCnt += len(eks)
}
}
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete success cnt(%v), err cnt(%v)", mp.config.PartitionId, successCnt, len(errExts))
if successCnt == 0 {
log.LogErrorf("[deleteExtentsFromList] no extents delete successfully, sleep")
continue
}
if len(errExts) != 0 {
log.LogDebugf("[deleteExtentsFromList] mp(%v) sync errExts(%v)", mp.config.PartitionId, errExts)
err = mp.sendExtentsToChan(errExts)
if err != nil {
log.LogErrorf("[deleteExtentsFromList] sendExtentsToChan by raft error, mp[%v], err(%v), ek(%v)", mp.config.PartitionId, err, len(errExts))
}
}
buff := bytes.NewBuffer([]byte{})
buff.WriteString(fmt.Sprintf("%s %d", fileName, cursor))
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete eks(%v) from file(%v)", mp.config.PartitionId, deleteCnt, fileName)
if _, err = mp.submit(opFSMInternalDelExtentCursor, buff.Bytes()); err != nil {
log.LogWarnf("[deleteExtentsFromList] partitionId=%d, %s",
mp.config.PartitionId, err.Error())
}
log.LogDebugf("[deleteExtentsFromList] mp(%v) file(%v), cursor(%v), size(%v)", mp.config.PartitionId, fileName, cursor, len(buf))
}
}
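// Editorial note: the reader above never rewrites the EXTENT_DEL_* file header
// directly; cursor advances are replicated through raft as a plain
// "<fileName> <cursor>" payload (opFSMInternalDelExtentCursor), and a fully
// consumed file is dropped via opFSMInternalDelExtentFile with the file name as
// payload. A sketch of the cursor submit, matching the code above:
//
//	payload := fmt.Sprintf("%s %d", fileName, cursor) // e.g. "EXTENT_DEL_V2_0 4096"
//	if _, err := mp.submit(opFSMInternalDelExtentCursor, []byte(payload)); err != nil {
//		// the next pass simply re-reads from the old cursor
//	}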
// func (mp *metaPartition) checkBatchDeleteExtents(allExtents map[uint64][]*proto.ExtentKey) {
// for partitionID, deleteExtents := range allExtents {
// needDeleteExtents := make([]proto.ExtentKey, len(deleteExtents))
// for index, ek := range deleteExtents {
// newEx := proto.ExtentKey{
// FileOffset: ek.FileOffset,
// PartitionId: ek.PartitionId,
// ExtentId: ek.ExtentId,
// ExtentOffset: ek.ExtentOffset,
// Size: ek.Size,
// CRC: ek.CRC,
// }
// needDeleteExtents[index] = newEx
// log.LogWritef("mp[%v] deleteExtents(%v)", mp.config.PartitionId, newEx.String())
// }
// err := mp.doBatchDeleteExtentsByPartition(partitionID, deleteExtents)
// if err != nil {
// log.LogWarnf(fmt.Sprintf("metaPartition(%v) dataPartitionID(%v)"+
// " batchDeleteExtentsByPartition failed(%v)", mp.config.PartitionId, partitionID, err))
// mp.extDelCh <- needDeleteExtents
// }
// DeleteWorkerSleepMs()
// }
// return
// }
package metanode
import (
"fmt"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/exporter"
)
type FileSizeRange uint32
const (
Size1K uint64 = 1 << 10
Size1M uint64 = 1 << 20
Size16M = 16 * Size1M
Size32M = 32 * Size1M
Size64M = 64 * Size1M
Size128M = 128 * Size1M
Size256M = 256 * Size1M
)
const (
LessThan1K FileSizeRange = iota
LessThan1M
LessThan16M
LessThan32M
LessThan64M
LessThan128M
LessThan256M
BiggerThan256M
MaxRangeType
)
const (
fileStatsCheckPeriod = time.Second * 30
)
func toString(fileSize FileSizeRange) string {
switch fileSize {
case LessThan1K:
return "<1K"
case LessThan1M:
return "<1M"
case LessThan16M:
return "<16M"
case LessThan32M:
return "<32M"
case LessThan64M:
return "<64M"
case LessThan128M:
return "<128M"
case LessThan256M:
return "<256M"
case BiggerThan256M:
return ">256M"
default:
return "unknown"
}
}
func (mp *metaPartition) setMetrics(fileRange []int64) {
for i, val := range fileRange {
labels := map[string]string{
"partid": fmt.Sprintf("%d", mp.config.PartitionId),
"volName": mp.config.VolName,
"sizeRange": toString(FileSizeRange(i)),
}
exporter.NewGauge("fileStats").SetWithLabels(float64(val), labels)
}
}
func (mp *metaPartition) fileStats(ino *Inode) {
if !mp.manager.fileStatsEnable {
return
}
fileRange := mp.fileRange
if ino.NLink > 0 && proto.IsRegular(ino.Type) {
if 0 <= ino.Size && ino.Size < Size1K {
fileRange[LessThan1K] += 1
} else if Size1K <= ino.Size && ino.Size < Size1M {
fileRange[LessThan1M] += 1
} else if Size1M <= ino.Size && ino.Size < Size16M {
fileRange[LessThan16M] += 1
} else if Size16M <= ino.Size && ino.Size < Size32M {
fileRange[LessThan32M] += 1
} else if Size32M <= ino.Size && ino.Size < Size64M {
fileRange[LessThan64M] += 1
} else if Size64M <= ino.Size && ino.Size < Size128M {
fileRange[LessThan128M] += 1
} else if Size128M <= ino.Size && ino.Size < Size256M {
fileRange[LessThan256M] += 1
} else {
fileRange[BiggerThan256M] += 1
}
}
}
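// Editorial note: fileStats classifies a regular file (NLink > 0) into exactly
// one of the size buckets above and increments the matching fileRange counter;
// directories and unlinked inodes are skipped. setMetrics() then publishes the
// counters as the "fileStats" gauge, labeled by partition id, volume name and
// size range.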
func (mp *metaPartition) startFileStats() {
checkTicker := time.NewTicker(fileStatsCheckPeriod)
go func(stopC chan bool) {
lastEnable := false
isLeader := false
for {
select {
case <-stopC:
// if this mp is closed, clear the metric
if lastEnable {
fileRange := make([]int64, MaxRangeType)
mp.setMetrics(fileRange)
}
checkTicker.Stop()
return
case <-checkTicker.C:
if !mp.manager.fileStatsEnable {
// if fileStatsEnable changes from true to false, clear the metric
if lastEnable {
fileRange := make([]int64, MaxRangeType)
mp.setMetrics(fileRange)
}
lastEnable = false
continue
}
lastEnable = true
// Clear the metric if the status changes from leader to follower
if _, isLeader = mp.IsLeader(); isLeader {
mp.setMetrics(mp.fileRange)
} else {
fileRange := make([]int64, MaxRangeType)
mp.setMetrics(fileRange)
}
}
}
}(mp.stopC)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"net"
"os"
"path"
"runtime/debug"
"sort"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/fileutil"
"github.com/cubefs/cubefs/util/log"
)
const (
AsyncDeleteInterval = 10 * time.Second
UpdateVolTicket = 2 * time.Minute
BatchCounts = 128
OpenRWAppendOpt = os.O_CREATE | os.O_RDWR | os.O_APPEND
TempFileValidTime = 86400 // units: sec
DeleteInodeFileExtension = "INODE_DEL"
DeleteWorkerCnt = 10
InodeNLink0DelayDeleteSeconds = 24 * 3600
DeleteInodeFileRollingSize = 500 * util.MB
)
func (mp *metaPartition) openDeleteInodeFile() (err error) {
if mp.delInodeFp, err = os.OpenFile(path.Join(mp.config.RootDir,
DeleteInodeFileExtension), OpenRWAppendOpt, 0o644); err != nil {
log.LogErrorf("[openDeleteInodeFile] failed to open delete inode file, err(%v)", err)
return
}
return
}
func (mp *metaPartition) startFreeList() (err error) {
if err = mp.openDeleteInodeFile(); err != nil {
return
}
// start vol update ticket
go mp.updateVolWorker()
go mp.deleteWorker()
mp.startToDeleteExtents()
return
}
func (mp *metaPartition) updateVolView(convert func(view *proto.DataPartitionsView) *DataPartitionsView) (err error) {
volName := mp.config.VolName
dataView, err := masterClient.ClientAPI().EncodingGzip().GetDataPartitions(volName)
if err != nil {
err = fmt.Errorf("updateVolWorker: get data partitions view fail: volume(%v) err(%v)",
volName, err)
log.LogErrorf(err.Error())
return
}
mp.vol.UpdatePartitions(convert(dataView))
volView, err := masterClient.AdminAPI().GetVolumeSimpleInfo(volName)
if err != nil {
err = fmt.Errorf("updateVolWorker: get volumeinfo fail: volume(%v) err(%v)", volName, err)
log.LogErrorf(err.Error())
return
}
mp.vol.volDeleteLockTime = volView.DeleteLockTime
return nil
}
func (mp *metaPartition) updateVolWorker() {
t := time.NewTicker(UpdateVolTicket)
convert := func(view *proto.DataPartitionsView) *DataPartitionsView {
newView := &DataPartitionsView{
DataPartitions: make([]*DataPartition, len(view.DataPartitions)),
}
for i := 0; i < len(view.DataPartitions); i++ {
if len(view.DataPartitions[i].Hosts) < 1 {
log.LogErrorf("updateVolWorker dp id(%v) is invalid, DataPartitionResponse detail[%v]",
view.DataPartitions[i].PartitionID, view.DataPartitions[i])
continue
}
newView.DataPartitions[i] = &DataPartition{
PartitionID: view.DataPartitions[i].PartitionID,
Status: view.DataPartitions[i].Status,
Hosts: view.DataPartitions[i].Hosts,
ReplicaNum: view.DataPartitions[i].ReplicaNum,
IsDiscard: view.DataPartitions[i].IsDiscard,
}
}
return newView
}
mp.updateVolView(convert)
for {
select {
case <-mp.stopC:
t.Stop()
return
case <-t.C:
mp.updateVolView(convert)
}
}
}
const (
MinDeleteBatchCounts = 100
MaxSleepCnt = 10
)
func (mp *metaPartition) deleteWorker() {
var (
idx int
isLeader bool
)
buffSlice := make([]uint64, 0, DeleteBatchCount())
var sleepCnt uint64
for {
buffSlice = buffSlice[:0]
select {
case <-mp.stopC:
log.LogDebugf("[metaPartition] deleteWorker stop partition: %v", mp.config)
return
default:
}
if _, isLeader = mp.IsLeader(); !isLeader {
time.Sleep(AsyncDeleteInterval)
continue
}
// add sleep time value
DeleteWorkerSleepMs()
isForceDeleted := sleepCnt%MaxSleepCnt == 0
if !isForceDeleted && mp.freeList.Len() < MinDeleteBatchCounts {
time.Sleep(AsyncDeleteInterval)
sleepCnt++
continue
}
// do nothing.
if mp.freeList.Len() == 0 {
time.Sleep(time.Minute)
continue
}
batchCount := DeleteBatchCount()
delayDeleteInos := make([]uint64, 0)
for idx = 0; idx < int(batchCount); idx++ {
// batch get free inode from the freeList
ino := mp.freeList.Pop()
if ino == 0 {
break
}
log.LogDebugf("action[deleteWorker]: remove inode(%v)", ino)
// check inode nlink == 0 and deleteMarkFlag unset
if inode, ok := mp.inodeTree.Get(&Inode{Inode: ino}).(*Inode); ok {
inTx, _ := mp.txProcessor.txResource.isInodeInTransction(inode)
if inode.ShouldDelayDelete() || inTx {
log.LogDebugf("[metaPartition] deleteWorker delay to remove inode: %v as NLink is 0, inTx %v", inode, inTx)
delayDeleteInos = append(delayDeleteInos, ino)
continue
}
}
buffSlice = append(buffSlice, ino)
}
// delay
for _, delayDeleteIno := range delayDeleteInos {
mp.freeList.Push(delayDeleteIno)
}
log.LogDebugf("metaPartition. buff slice [%v]", buffSlice)
mp.persistDeletedInodes(buffSlice)
mp.deleteMarkedInodes(buffSlice)
sleepCnt++
}
}
// batchDeleteExtentsByPartition deletes extents grouped by data partition and splits the inodes into those whose extents were all deleted (shouldCommit) and those to push back to the free list.
func (mp *metaPartition) batchDeleteExtentsByPartition(partitionDeleteExtents map[uint64][]*proto.ExtentKey,
allInodes []*Inode) (shouldCommit []*Inode, shouldPushToFreeList []*Inode) {
occurErrors := make(map[uint64]error)
shouldCommit = make([]*Inode, 0, len(allInodes))
shouldPushToFreeList = make([]*Inode, 0)
var (
wg sync.WaitGroup
lock sync.Mutex
)
// wait for every partition to finish its batch extent deletion
for partitionID, extents := range partitionDeleteExtents {
dp := mp.vol.GetPartition(partitionID)
// NOTE: skip the dp if it is missing or discarded
if dp == nil || dp.IsDiscard {
log.LogWarnf("action[batchDeleteExtentsByPartition] dp(%v) is missing or discarded, skip extents count(%v)", partitionID, len(extents))
continue
}
log.LogDebugf("batchDeleteExtentsByPartition partitionID %v extents %v", partitionID, extents)
wg.Add(1)
go func(partitionID uint64, extents []*proto.ExtentKey) {
defer wg.Done()
perr := mp.doBatchDeleteExtentsByPartition(partitionID, extents)
lock.Lock()
occurErrors[partitionID] = perr
lock.Unlock()
}(partitionID, extents)
}
wg.Wait()
// iterate over all inodes; an inode whose extents were all deleted successfully goes to shouldCommit, otherwise it goes back to the free list
for i := 0; i < len(allInodes); i++ {
successDeleteExtentCnt := 0
inode := allInodes[i]
inode.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
if occurErrors[ek.PartitionId] != nil {
log.LogWarnf("deleteInode inode[%v] error(%v)", inode.Inode, occurErrors[ek.PartitionId])
return false
}
successDeleteExtentCnt++
return true
})
if successDeleteExtentCnt == inode.Extents.Len() {
shouldCommit = append(shouldCommit, inode)
log.LogDebugf("action[batchDeleteExtentsByPartition]: delete inode(%v) success", inode)
} else {
shouldPushToFreeList = append(shouldPushToFreeList, inode)
log.LogDebugf("action[batchDeleteExtentsByPartition]: delete inode(%v) fail", inode)
}
}
return
}
// Delete the marked inodes.
func (mp *metaPartition) deleteMarkedInodes(inoSlice []uint64) {
defer func() {
if r := recover(); r != nil {
stack := string(debug.Stack())
log.LogErrorf(fmt.Sprintf("metaPartition(%v) deleteMarkedInodes panic (%v)\nstack:%v",
mp.config.PartitionId, r, stack))
}
}()
if len(inoSlice) == 0 {
return
}
log.LogDebugf("[deleteMarkedInodes] . mp[%v] inoSlice [%v]", mp.config.PartitionId, inoSlice)
shouldCommit := make([]*Inode, 0, DeleteBatchCount())
shouldRePushToFreeList := make([]*Inode, 0)
deleteExtentsByPartition := make(map[uint64][]*proto.ExtentKey)
allInodes := make([]*Inode, 0)
for _, ino := range inoSlice {
ref := &Inode{Inode: ino}
inode, ok := mp.inodeTree.Get(ref).(*Inode)
if !ok {
log.LogDebugf("[deleteMarkedInodes] . mp[%v] inode[%v] not found", mp.config.PartitionId, ino)
continue
}
if !inode.ShouldDelete() {
log.LogWarnf("[deleteMarkedInodes] : inode should not be deleted, ino %s", inode.String())
continue
}
log.LogDebugf("[deleteMarkedInodes] . mp[%v] inode[%v] inode.Extents: %v, ino verList: %v",
mp.config.PartitionId, ino, inode.Extents, inode.GetMultiVerString())
if inode.getLayerLen() > 0 {
log.LogErrorf("[deleteMarkedInodes] deleteMarkedInodes. mp[%v] inode[%v] verlist len %v should not drop",
mp.config.PartitionId, ino, inode.getLayerLen())
return
}
extInfo := inode.GetAllExtsOfflineInode(mp.config.PartitionId)
for dpID, inodeExts := range extInfo {
exts, ok := deleteExtentsByPartition[dpID]
if !ok {
exts = make([]*proto.ExtentKey, 0)
}
exts = append(exts, inodeExts...)
log.LogWritef("[deleteMarkedInodes] mp[%v] ino(%v) deleteExtent(%v)", mp.config.PartitionId, inode.Inode, len(inodeExts))
deleteExtentsByPartition[dpID] = exts
}
allInodes = append(allInodes, inode)
}
if proto.IsCold(mp.volType) {
// delete ebs obj extents
shouldCommit, shouldRePushToFreeList = mp.doBatchDeleteObjExtentsInEBS(allInodes)
log.LogInfof("[deleteMarkedInodes] metaPartition(%v) deleteInodeCnt(%d) shouldRePush(%d)",
mp.config.PartitionId, len(shouldCommit), len(shouldRePushToFreeList))
for _, inode := range shouldRePushToFreeList {
mp.freeList.Push(inode.Inode)
}
allInodes = shouldCommit
}
log.LogInfof("[deleteMarkedInodes] metaPartition(%v) deleteExtentsByPartition(%v) allInodes(%v)",
mp.config.PartitionId, deleteExtentsByPartition, allInodes)
shouldCommit, shouldRePushToFreeList = mp.batchDeleteExtentsByPartition(deleteExtentsByPartition, allInodes)
bufSlice := make([]byte, 0, 8*len(shouldCommit))
for _, inode := range shouldCommit {
bufSlice = append(bufSlice, inode.MarshalKey()...)
}
err := mp.syncToRaftFollowersFreeInode(bufSlice)
if err != nil {
log.LogWarnf("[deleteMarkedInodes] raft commit inode list: %v, "+
"response %s", shouldCommit, err.Error())
}
for _, inode := range shouldCommit {
if err == nil {
mp.internalDeleteInode(inode)
} else {
mp.freeList.Push(inode.Inode)
}
}
log.LogInfof("[deleteMarkedInodes] metaPartition(%v) deleteInodeCnt(%v) inodeCnt(%v)", mp.config.PartitionId, len(shouldCommit), mp.inodeTree.Len())
for _, inode := range shouldRePushToFreeList {
mp.freeList.Push(inode.Inode)
}
// try again.
if len(shouldRePushToFreeList) > 0 && deleteWorkerSleepMs == 0 {
time.Sleep(time.Duration(1000) * time.Millisecond)
}
}
func (mp *metaPartition) syncToRaftFollowersFreeInode(hasDeleteInodes []byte) (err error) {
if len(hasDeleteInodes) == 0 {
return
}
_, err = mp.submit(opFSMInternalDeleteInode, hasDeleteInodes)
return
}
func (mp *metaPartition) notifyRaftFollowerToFreeInodes(wg *sync.WaitGroup, target string, hasDeleteInodes []byte) (err error) {
var conn *net.TCPConn
conn, err = mp.config.ConnPool.GetConnect(target)
defer func() {
wg.Done()
if err != nil {
log.LogWarnf(err.Error())
mp.config.ConnPool.PutConnect(conn, ForceClosedConnect)
} else {
mp.config.ConnPool.PutConnect(conn, NoClosedConnect)
}
}()
if err != nil {
return
}
request := NewPacketToFreeInodeOnRaftFollower(mp.config.PartitionId, hasDeleteInodes)
if err = request.WriteToConn(conn); err != nil {
return
}
if err = request.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime); err != nil {
return
}
if request.ResultCode != proto.OpOk {
err = fmt.Errorf("request(%v) error(%v)", request.GetUniqueLogId(), string(request.Data[:request.Size]))
}
return
}
func (mp *metaPartition) doDeleteMarkedInodes(ext *proto.ExtentKey) (err error) {
// get the data node view
dp := mp.vol.GetPartition(ext.PartitionId)
log.LogDebugf("action[doDeleteMarkedInodes] dp(%v) status (%v)", dp.PartitionID, dp.Status)
if dp == nil {
if proto.IsCold(mp.volType) {
log.LogInfof("[doDeleteMarkedInodes] ext(%s) is already been deleted, not delete any more", ext.String())
return
}
err = errors.NewErrorf("unknown dataPartitionID=%d in vol",
ext.PartitionId)
return
}
// delete the data node
if len(dp.Hosts) < 1 {
log.LogErrorf("doBatchDeleteExtentsByPartition dp id(%v) is invalid, detail[%v]", ext.PartitionId, dp)
err = errors.NewErrorf("dp id(%v) is invalid", ext.PartitionId)
return
}
// NOTE: if all replicas in the dp are dead,
// skip sending the request to the dp leader
if dp.Status == proto.Unavailable {
return
}
addr := util.ShiftAddrPort(dp.Hosts[0], smuxPortShift)
conn, err := smuxPool.GetConnect(addr)
log.LogInfof("doDeleteMarkedInodes mp (%v) GetConnect (%v), ext(%s)", mp.config.PartitionId, addr, ext.String())
defer func() {
smuxPool.PutConnect(conn, ForceClosedConnect)
log.LogInfof("doDeleteMarkedInodes mp (%v) PutConnect (%v), ext(%s)", mp.config.PartitionId, addr, ext.String())
}()
if err != nil {
err = errors.NewErrorf("get conn from pool %s, "+
"extent(%s))",
err.Error(), ext.String())
return
}
var (
p *Packet
invalid bool
)
if p, invalid = NewPacketToDeleteExtent(dp, ext); invalid {
p.ResultCode = proto.OpOk
return
}
if err = p.WriteToConn(conn); err != nil {
err = errors.NewErrorf("write to dataNode %s, %s", p.GetUniqueLogId(),
err.Error())
return
}
if err = p.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
err = errors.NewErrorf("read response from dataNode %s, %s",
p.GetUniqueLogId(), err.Error())
return
}
if p.ResultCode == proto.OpTryOtherAddr && proto.IsCold(mp.volType) {
log.LogInfof("[doBatchDeleteExtentsByPartition] deleteOp retrun tryOtherAddr code means dp is deleted for LF vol, ext(%s)", ext.String())
return
}
if p.ResultCode != proto.OpOk {
err = errors.NewErrorf("[deleteMarkedInodes] %s response: %s", p.GetUniqueLogId(),
p.GetResultMsg())
}
return
}
func (mp *metaPartition) doBatchDeleteExtentsByPartition(partitionID uint64, exts []*proto.ExtentKey) (err error) {
// get the data node view
dp := mp.vol.GetPartition(partitionID)
if dp == nil {
if proto.IsCold(mp.volType) {
log.LogInfof("[doBatchDeleteExtentsByPartition] dp(%d) is already been deleted, not delete any more", partitionID)
return
}
err = errors.NewErrorf("unknown dataPartitionID=%d in vol",
partitionID)
return
}
for _, ext := range exts {
if ext.PartitionId != partitionID {
err = errors.NewErrorf("BatchDeleteExtent do batchDelete on PartitionID(%v) but unexpect Extent(%v)", partitionID, ext)
return
}
}
// delete the data node
if len(dp.Hosts) < 1 {
log.LogErrorf("doBatchDeleteExtentsByPartition dp id(%v) is invalid, detail[%v]", partitionID, dp)
err = errors.NewErrorf("dp id(%v) is invalid", partitionID)
return
}
addr := util.ShiftAddrPort(dp.Hosts[0], smuxPortShift)
conn, err := smuxPool.GetConnect(addr)
log.LogInfof("doBatchDeleteExtentsByPartition mp (%v) GetConnect (%v)", mp.config.PartitionId, addr)
ResultCode := proto.OpOk
defer func() {
smuxPool.PutConnect(conn, ForceClosedConnect)
log.LogInfof("doBatchDeleteExtentsByPartition mp (%v) PutConnect (%v)", mp.config.PartitionId, addr)
}()
if err != nil {
err = errors.NewErrorf("get conn from pool %s, "+
"extents partitionId=%d",
err.Error(), partitionID)
return
}
p := NewPacketToBatchDeleteExtent(dp, exts)
if err = p.WriteToConn(conn); err != nil {
err = errors.NewErrorf("write to dataNode %s, %s", p.GetUniqueLogId(),
err.Error())
return
}
if err = p.ReadFromConnWithVer(conn, proto.BatchDeleteExtentReadDeadLineTime); err != nil {
err = errors.NewErrorf("read response from dataNode %s, %s",
p.GetUniqueLogId(), err.Error())
return
}
ResultCode = p.ResultCode
if ResultCode == proto.OpTryOtherAddr && proto.IsCold(mp.volType) {
log.LogInfof("[doBatchDeleteExtentsByPartition] deleteOp retrun tryOtherAddr code means dp is deleted for LF vol, dp(%d)", partitionID)
return
}
if p.ResultCode != proto.OpOk {
err = errors.NewErrorf("[deleteMarkedInodes] %s response: %s", p.GetUniqueLogId(),
p.GetResultMsg())
}
return
}
const maxDelCntOnce = 512
func (mp *metaPartition) doBatchDeleteObjExtentsInEBS(allInodes []*Inode) (shouldCommit []*Inode, shouldPushToFreeList []*Inode) {
shouldCommit = make([]*Inode, 0, len(allInodes))
shouldPushToFreeList = make([]*Inode, 0)
var (
wg sync.WaitGroup
lock sync.Mutex
)
for _, inode := range allInodes {
wg.Add(1)
inode.RLock()
inode.ObjExtents.RLock()
go func(ino *Inode, oeks []proto.ObjExtentKey) {
defer wg.Done()
log.LogDebugf("[doBatchDeleteObjExtentsInEBS] ino(%d) delObjEks[%d]", ino.Inode, len(oeks))
err := mp.deleteObjExtents(oeks)
lock.Lock()
if err != nil {
shouldPushToFreeList = append(shouldPushToFreeList, ino)
log.LogErrorf("[doBatchDeleteObjExtentsInEBS] delete ebs eks fail, ino(%d), cnt(%d), err(%s)", ino.Inode, len(oeks), err.Error())
} else {
shouldCommit = append(shouldCommit, ino)
}
lock.Unlock()
ino.ObjExtents.RUnlock()
ino.RUnlock()
}(inode, inode.ObjExtents.eks)
}
wg.Wait()
return
}
func (mp *metaPartition) deleteObjExtents(oeks []proto.ObjExtentKey) (err error) {
total := len(oeks)
for i := 0; i < total; i += maxDelCntOnce {
max := util.Min(i+maxDelCntOnce, total)
err = mp.ebsClient.Delete(oeks[i:max])
if err != nil {
log.LogErrorf("[deleteObjExtents] delete ebs eks fail, cnt(%d), err(%s)", max-i, err.Error())
return err
}
}
return err
}
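// Editorial sketch (not part of the original source): deleteObjExtents trims
// each call to the blobstore client to at most maxDelCntOnce keys. The slicing
// idiom on its own, where items and send stand in for the object-extent keys
// and the client call:
//
//	for i := 0; i < len(items); i += maxDelCntOnce {
//		end := util.Min(i+maxDelCntOnce, len(items))
//		if err := send(items[i:end]); err != nil { // at most maxDelCntOnce entries
//			return err
//		}
//	}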
func (mp *metaPartition) recycleInodeDelFile() {
// NOTE: get all files
dentries, err := os.ReadDir(mp.config.RootDir)
if err != nil {
log.LogErrorf("[recycleInodeDelFile] mp(%v) failed to read dir(%v)", mp.config.PartitionId, mp.config.RootDir)
return
}
inodeDelFiles := make([]string, 0)
for _, dentry := range dentries {
if strings.HasPrefix(dentry.Name(), DeleteInodeFileExtension) && strings.HasSuffix(dentry.Name(), ".old") {
inodeDelFiles = append(inodeDelFiles, dentry.Name())
}
}
// NOTE: sort files
sort.Slice(inodeDelFiles, func(i, j int) bool {
// NOTE: date format satisfies dictionary order
return inodeDelFiles[i] < inodeDelFiles[j]
})
// NOTE: check disk space and recycle files
for len(inodeDelFiles) > 0 {
diskSpaceLeft := int64(0)
stat, err := fileutil.Statfs(mp.config.RootDir)
if err != nil {
log.LogErrorf("[recycleInodeDelFile] mp(%v) failed to get fs info", mp.config.PartitionId)
return
}
diskSpaceLeft = int64(stat.Bavail * uint64(stat.Bsize))
if diskSpaceLeft >= 50*util.GB && len(inodeDelFiles) < 5 {
log.LogDebugf("[recycleInodeDelFile] mp(%v) not need to recycle, return", mp.config.PartitionId)
return
}
// NOTE: delete a file and pop an item
oldestFile := inodeDelFiles[len(inodeDelFiles)-1]
inodeDelFiles = inodeDelFiles[:len(inodeDelFiles)-1]
err = os.Remove(path.Join(mp.config.RootDir, oldestFile))
if err != nil {
log.LogErrorf("[recycleInodeDelFile] mp(%v) failed to remove file(%v)", mp.config.PartitionId, oldestFile)
return
}
}
}
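// persistDeletedInode appends the inode number to the delete-inode file. When
// the file reaches DeleteInodeFileRollingSize it is synced, closed and renamed
// to a timestamped *.old file, old rolled files are recycled, and a fresh file
// is opened before the write.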
func (mp *metaPartition) persistDeletedInode(ino uint64, currentSize *uint64) {
if *currentSize >= DeleteInodeFileRollingSize {
fileName := fmt.Sprintf("%v.%v.%v", DeleteInodeFileExtension, time.Now().Format(log.FileNameDateFormat), "old")
if err := mp.delInodeFp.Sync(); err != nil {
log.LogErrorf("[persistDeletedInode] failed to sync delete inode file, err(%v), inode(%v)", err, ino)
return
}
mp.delInodeFp.Close()
mp.delInodeFp = nil
// NOTE: it is ok if the rename fails,
// we will re-open the file below
fileName = path.Join(mp.config.RootDir, fileName)
err := os.Rename(path.Join(mp.config.RootDir, DeleteInodeFileExtension), fileName)
if err != nil {
log.LogErrorf("[persistDeletedInode] failed to rename delete inode file, err(%v)", err)
} else {
*currentSize = 0
mp.recycleInodeDelFile()
}
if err = mp.openDeleteInodeFile(); err != nil {
log.LogErrorf("[persistDeletedInode] failed to open delete inode file, err(%v), inode(%v)", err, ino)
return
}
}
// NOTE: approximate the written size as sizeof(uint64)
*currentSize += 8
if _, err := mp.delInodeFp.WriteString(fmt.Sprintf("%v\n", ino)); err != nil {
log.LogErrorf("[persistDeletedInode] failed to persist ino(%v), err(%v)", ino, err)
return
}
}
func (mp *metaPartition) persistDeletedInodes(inos []uint64) {
log.LogDebugf("persistDeletedInodes. inos [%v]", inos)
if mp.delInodeFp == nil {
// NOTE: try to re-open the file
if err := mp.openDeleteInodeFile(); err != nil {
log.LogErrorf("[persistDeletedInodes] delete inode file is not open, err(%v), inodes(%v)", err, inos)
return
}
log.LogWarnf("[persistDeletedInodes] re-open file success")
}
info, err := mp.delInodeFp.Stat()
if err != nil {
log.LogErrorf("[persistDeletedInodes] failed to get size of delete inode file, err(%v), inodes(%v)", err, inos)
return
}
currSize := uint64(info.Size())
for _, ino := range inos {
mp.persistDeletedInode(ino, &currSize)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"math"
"net"
"os"
"path"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
raftproto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
// Apply applies the given operational commands.
func (mp *metaPartition) Apply(command []byte, index uint64) (resp interface{}, err error) {
msg := &MetaItem{}
defer func() {
if err == nil {
mp.uploadApplyID(index)
}
}()
if err = msg.UnmarshalJson(command); err != nil {
return
}
mp.nonIdempotent.Lock()
defer mp.nonIdempotent.Unlock()
switch msg.Op {
case opFSMCreateInode:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
if mp.config.Cursor < ino.Inode {
mp.config.Cursor = ino.Inode
}
resp = mp.fsmCreateInode(ino)
case opFSMCreateInodeQuota:
qinode := &MetaQuotaInode{}
if err = qinode.Unmarshal(msg.V); err != nil {
return
}
ino := qinode.inode
if mp.config.Cursor < ino.Inode {
mp.config.Cursor = ino.Inode
}
if len(qinode.quotaIds) > 0 {
mp.setInodeQuota(qinode.quotaIds, ino.Inode)
}
resp = mp.fsmCreateInode(ino)
if resp == proto.OpOk {
for _, quotaId := range qinode.quotaIds {
mp.mqMgr.updateUsedInfo(0, 1, quotaId)
}
}
case opFSMUnlinkInode:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = &InodeResponse{Status: status}
return
}
resp = mp.fsmUnlinkInode(ino, 0)
case opFSMUnlinkInodeOnce:
var inoOnce *InodeOnce
if inoOnce, err = InodeOnceUnmarshal(msg.V); err != nil {
return
}
ino := NewInode(inoOnce.Inode, 0)
ino.setVer(inoOnce.VerSeq)
resp = mp.fsmUnlinkInode(ino, inoOnce.UniqID)
case opFSMUnlinkInodeBatch:
inodes, err := InodeBatchUnmarshal(msg.V)
if err != nil {
return nil, err
}
resp = mp.fsmUnlinkInodeBatch(inodes)
case opFSMExtentTruncate:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmExtentsTruncate(ino)
case opFSMCreateLinkInode:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = &InodeResponse{Status: status}
return
}
resp = mp.fsmCreateLinkInode(ino, 0)
case opFSMCreateLinkInodeOnce:
var inoOnce *InodeOnce
if inoOnce, err = InodeOnceUnmarshal(msg.V); err != nil {
return
}
ino := NewInode(inoOnce.Inode, 0)
resp = mp.fsmCreateLinkInode(ino, inoOnce.UniqID)
case opFSMEvictInode:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = &InodeResponse{Status: status}
return
}
resp = mp.fsmEvictInode(ino)
case opFSMEvictInodeBatch:
inodes, err := InodeBatchUnmarshal(msg.V)
if err != nil {
return nil, err
}
resp = mp.fsmBatchEvictInode(inodes)
case opFSMSetAttr:
req := &SetattrRequest{}
err = json.Unmarshal(msg.V, req)
if err != nil {
return
}
err = mp.fsmSetAttr(req)
case opFSMCreateDentry:
den := &Dentry{}
if err = den.Unmarshal(msg.V); err != nil {
return
}
status := mp.dentryInTx(den.ParentId, den.Name)
if status != proto.OpOk {
resp = status
return
}
resp = mp.fsmCreateDentry(den, false)
case opFSMDeleteDentry:
den := &Dentry{}
if err = den.Unmarshal(msg.V); err != nil {
return
}
status := mp.dentryInTx(den.ParentId, den.Name)
if status != proto.OpOk {
resp = status
return
}
resp = mp.fsmDeleteDentry(den, false)
case opFSMDeleteDentryBatch:
db, err := DentryBatchUnmarshal(msg.V)
if err != nil {
return nil, err
}
resp = mp.fsmBatchDeleteDentry(db)
case opFSMUpdateDentry:
den := &Dentry{}
if err = den.Unmarshal(msg.V); err != nil {
return
}
status := mp.dentryInTx(den.ParentId, den.Name)
if status != proto.OpOk {
resp = &DentryResponse{Status: status}
return
}
resp = mp.fsmUpdateDentry(den)
case opFSMUpdatePartition:
req := &UpdatePartitionReq{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp, err = mp.fsmUpdatePartition(req.End)
case opFSMExtentsAdd:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmAppendExtents(ino)
case opFSMExtentsAddWithCheck:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmAppendExtentsWithCheck(ino, false)
case opFSMExtentSplit:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmAppendExtentsWithCheck(ino, true)
case opFSMObjExtentsAdd:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmAppendObjExtents(ino)
case opFSMExtentsEmpty:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmExtentsEmpty(ino)
case opFSMClearInodeCache:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmClearInodeCache(ino)
case opFSMSentToChan:
resp = mp.fsmSendToChan(msg.V, true)
case opFSMStoreTick:
inodeTree := mp.inodeTree.GetTree()
dentryTree := mp.dentryTree.GetTree()
extendTree := mp.extendTree.GetTree()
multipartTree := mp.multipartTree.GetTree()
txTree := mp.txProcessor.txManager.txTree.GetTree()
txRbInodeTree := mp.txProcessor.txResource.txRbInodeTree.GetTree()
txRbDentryTree := mp.txProcessor.txResource.txRbDentryTree.GetTree()
txId := mp.txProcessor.txManager.txIdAlloc.getTransactionID()
quotaRebuild := mp.mqMgr.statisticRebuildStart()
uidRebuild := mp.acucumRebuildStart()
uniqChecker := mp.uniqChecker.clone()
msg := &storeMsg{
command: opFSMStoreTick,
applyIndex: index,
txId: txId,
inodeTree: inodeTree,
dentryTree: dentryTree,
extendTree: extendTree,
multipartTree: multipartTree,
txTree: txTree,
txRbInodeTree: txRbInodeTree,
txRbDentryTree: txRbDentryTree,
quotaRebuild: quotaRebuild,
uidRebuild: uidRebuild,
uniqChecker: uniqChecker,
multiVerList: mp.GetAllVerList(),
}
log.LogDebugf("opFSMStoreTick: quotaRebuild [%v] uidRebuild [%v]", quotaRebuild, uidRebuild)
mp.storeChan <- msg
case opFSMInternalDeleteInode:
err = mp.internalDelete(msg.V)
case opFSMInternalDeleteInodeBatch:
err = mp.internalDeleteBatch(msg.V)
case opFSMInternalDelExtentFile:
err = mp.delOldExtentFile(msg.V)
case opFSMInternalDelExtentCursor:
err = mp.setExtentDeleteFileCursor(msg.V)
case opFSMSetXAttr:
var extend *Extend
if extend, err = NewExtendFromBytes(msg.V); err != nil {
return
}
err = mp.fsmSetXAttr(extend)
case opFSMRemoveXAttr:
var extend *Extend
if extend, err = NewExtendFromBytes(msg.V); err != nil {
return
}
err = mp.fsmRemoveXAttr(extend)
case opFSMUpdateXAttr:
var extend *Extend
if extend, err = NewExtendFromBytes(msg.V); err != nil {
return
}
err = mp.fsmSetXAttr(extend)
case opFSMCreateMultipart:
multipart := MultipartFromBytes(msg.V)
resp = mp.fsmCreateMultipart(multipart)
case opFSMRemoveMultipart:
multipart := MultipartFromBytes(msg.V)
resp = mp.fsmRemoveMultipart(multipart)
case opFSMAppendMultipart:
multipart := MultipartFromBytes(msg.V)
resp = mp.fsmAppendMultipart(multipart)
case opFSMSyncCursor:
cursor := binary.BigEndian.Uint64(msg.V)
if cursor > mp.config.Cursor {
mp.config.Cursor = cursor
}
case opFSMSyncTxID:
txID := binary.BigEndian.Uint64(msg.V)
if txID > mp.txProcessor.txManager.txIdAlloc.getTransactionID() {
mp.txProcessor.txManager.txIdAlloc.setTransactionID(txID)
}
case opFSMTxInit:
txInfo := proto.NewTransactionInfo(0, 0)
if err = txInfo.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxInit(txInfo)
case opFSMTxCreateInode:
txIno := NewTxInode(0, 0, nil)
if err = txIno.Unmarshal(msg.V); err != nil {
return
}
if mp.config.Cursor < txIno.Inode.Inode {
mp.config.Cursor = txIno.Inode.Inode
}
resp = mp.fsmTxCreateInode(txIno, []uint32{})
case opFSMTxCreateInodeQuota:
qinode := &TxMetaQuotaInode{}
if err = qinode.Unmarshal(msg.V); err != nil {
return
}
txIno := qinode.txinode
if mp.config.Cursor < txIno.Inode.Inode {
mp.config.Cursor = txIno.Inode.Inode
}
if len(qinode.quotaIds) > 0 {
mp.setInodeQuota(qinode.quotaIds, txIno.Inode.Inode)
}
resp = mp.fsmTxCreateInode(txIno, qinode.quotaIds)
if resp == proto.OpOk {
for _, quotaId := range qinode.quotaIds {
mp.mqMgr.updateUsedInfo(0, 1, quotaId)
}
}
case opFSMTxCreateDentry:
txDen := NewTxDentry(0, "", 0, 0, nil, nil)
if err = txDen.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxCreateDentry(txDen)
case opFSMTxSetState:
req := &proto.TxSetStateRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmTxSetState(req)
case opFSMTxCommitRM:
req := &proto.TransactionInfo{}
if err = req.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxCommitRM(req)
case opFSMTxRollbackRM:
req := &proto.TransactionInfo{}
if err = req.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxRollbackRM(req)
case opFSMTxCommit:
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmTxCommit(req.TxID)
case opFSMTxRollback:
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmTxRollback(req.TxID)
case opFSMTxDelete:
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmTxDelete(req.TxID)
case opFSMTxDeleteDentry:
txDen := NewTxDentry(0, "", 0, 0, nil, nil)
if err = txDen.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxDeleteDentry(txDen)
case opFSMTxUnlinkInode:
txIno := NewTxInode(0, 0, nil)
if err = txIno.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxUnlinkInode(txIno)
case opFSMTxUpdateDentry:
// txDen := NewTxDentry(0, "", 0, 0, nil)
txUpdateDen := NewTxUpdateDentry(nil, nil, nil)
if err = txUpdateDen.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxUpdateDentry(txUpdateDen)
case opFSMTxCreateLinkInode:
txIno := NewTxInode(0, 0, nil)
if err = txIno.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxCreateLinkInode(txIno)
case opFSMSetInodeQuotaBatch:
req := &proto.BatchSetMetaserverQuotaReuqest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmSetInodeQuotaBatch(req)
case opFSMDeleteInodeQuotaBatch:
req := &proto.BatchDeleteMetaserverQuotaReuqest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmDeleteInodeQuotaBatch(req)
case opFSMUniqID:
resp = mp.fsmUniqID(msg.V)
case opFSMUniqCheckerEvict:
req := &fsmEvictUniqCheckerRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
err = mp.fsmUniqCheckerEvict(req)
case opFSMVersionOp:
err = mp.fsmVersionOp(msg.V)
default:
// do nothing
}
return
}
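// runVersionOp consumes version-update payloads from verUpdateChan and submits
// them to raft as opFSMVersionOp commands until the partition is stopped.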
func (mp *metaPartition) runVersionOp() {
mp.verUpdateChan = make(chan []byte, 100)
for {
select {
case verData := <-mp.verUpdateChan:
mp.submit(opFSMVersionOp, verData)
case <-mp.stopC:
log.LogWarnf("runVersionOp exit!")
return
}
}
}
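// fsmVersionOp applies a volume version operation (CreateVersionPrepare,
// CreateVersionCommit, DeleteVersion or SyncBatchVersionList) to the
// partition's multi-version list and updates mp.verSeq accordingly.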
func (mp *metaPartition) fsmVersionOp(reqData []byte) (err error) {
mp.multiVersionList.RWLock.Lock()
defer mp.multiVersionList.RWLock.Unlock()
var opData VerOpData
if err = json.Unmarshal(reqData, &opData); err != nil {
log.LogErrorf("action[fsmVersionOp] mp[%v] unmarshal error %v", mp.config.PartitionId, err)
return
}
log.LogInfof("action[fsmVersionOp] volname [%v] mp[%v] seq [%v], op [%v]", mp.config.VolName, mp.config.PartitionId, opData.VerSeq, opData.Op)
if opData.Op == proto.CreateVersionPrepare {
cnt := len(mp.multiVersionList.VerList)
if cnt > 0 {
lastVersion := mp.multiVersionList.VerList[cnt-1]
if lastVersion.Ver > opData.VerSeq {
log.LogWarnf("action[HandleVersionOp] createVersionPrepare reqeust seq [%v] less than last exist snapshot seq [%v]", opData.VerSeq, lastVersion.Ver)
return
} else if lastVersion.Ver == opData.VerSeq {
log.LogWarnf("action[HandleVersionOp] CreateVersionPrepare request seq [%v] already exist status [%v]", opData.VerSeq, lastVersion.Status)
return
}
}
newVer := &proto.VolVersionInfo{
Status: proto.VersionPrepare,
Ver: opData.VerSeq,
}
mp.verSeq = opData.VerSeq
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList, newVer)
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] seq [%v], op [%v], seqArray size %v", mp.config.PartitionId, opData.VerSeq, opData.Op, len(mp.multiVersionList.VerList))
} else if opData.Op == proto.CreateVersionCommit {
cnt := len(mp.multiVersionList.VerList)
if cnt > 0 {
if mp.multiVersionList.VerList[cnt-1].Ver > opData.VerSeq {
log.LogWarnf("action[fsmVersionOp] mp[%v] reqeust seq [%v] less than last exist snapshot seq [%v]", mp.config.PartitionId,
opData.VerSeq, mp.multiVersionList.VerList[cnt-1].Ver)
return
}
if mp.multiVersionList.VerList[cnt-1].Ver == opData.VerSeq {
if mp.multiVersionList.VerList[cnt-1].Status != proto.VersionPrepare {
log.LogWarnf("action[fsmVersionOp] mp[%v] reqeust seq [%v] Equal last exist snapshot seq [%v] but with status [%v]", mp.config.PartitionId,
mp.multiVersionList.VerList[cnt-1].Ver, opData.VerSeq, mp.multiVersionList.VerList[cnt-1].Status)
}
mp.multiVersionList.VerList[cnt-1].Status = proto.VersionNormal
return
}
}
newVer := &proto.VolVersionInfo{
Status: proto.VersionNormal,
Ver: opData.VerSeq,
}
mp.verSeq = opData.VerSeq
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList, newVer)
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] seq [%v], op [%v], seqArray size %v", mp.config.PartitionId, opData.VerSeq, opData.Op, len(mp.multiVersionList.VerList))
} else if opData.Op == proto.DeleteVersion {
for i, ver := range mp.multiVersionList.VerList {
if i == len(mp.multiVersionList.VerList)-1 {
log.LogWarnf("action[fsmVersionOp] mp[%v] seq [%v], op [%v], seqArray size %v newest ver [%v] reque ver [%v]",
mp.config.PartitionId, opData.VerSeq, opData.Op, len(mp.multiVersionList.VerList), ver.Ver, opData.VerSeq)
break
}
if ver.Ver == opData.VerSeq {
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] seq [%v], op [%v], VerList %v",
mp.config.PartitionId, opData.VerSeq, opData.Op, mp.multiVersionList.VerList)
// mp.multiVersionList = append(mp.multiVersionList[:i], mp.multiVersionList[i+1:]...)
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList[:i], mp.multiVersionList.VerList[i+1:]...)
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] seq [%v], op [%v], VerList %v",
mp.config.PartitionId, opData.VerSeq, opData.Op, mp.multiVersionList.VerList)
break
}
}
} else if opData.Op == proto.SyncBatchVersionList {
log.LogInfof("action[fsmVersionOp] mp[%v] before update:with seq [%v] verlist %v opData.VerList %v",
mp.config.PartitionId, mp.verSeq, mp.multiVersionList.VerList, opData.VerList)
lastVer := mp.multiVersionList.GetLastVer()
for _, info := range opData.VerList {
if info.Ver > lastVer {
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList, info)
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] after update:with seq [%v] verlist %v",
mp.config.PartitionId, mp.verSeq, mp.multiVersionList.VerList)
}
}
mp.verSeq = mp.multiVersionList.GetLastVer()
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] after update:with seq [%v] verlist %v",
mp.config.PartitionId, mp.verSeq, mp.multiVersionList.VerList)
} else {
log.LogErrorf("action[fsmVersionOp] mp[%v] with seq [%v] process op type %v seq [%v] not found",
mp.config.PartitionId, mp.verSeq, opData.Op, opData.VerSeq)
}
return
}
// ApplyMemberChange apply changes to the raft member.
func (mp *metaPartition) ApplyMemberChange(confChange *raftproto.ConfChange, index uint64) (resp interface{}, err error) {
defer func() {
if err == nil {
mp.uploadApplyID(index)
}
}()
// change memory status
var (
updated bool
)
switch confChange.Type {
case raftproto.ConfAddNode:
req := &proto.AddMetaPartitionRaftMemberRequest{}
if err = json.Unmarshal(confChange.Context, req); err != nil {
return
}
updated, err = mp.confAddNode(req, index)
case raftproto.ConfRemoveNode:
req := &proto.RemoveMetaPartitionRaftMemberRequest{}
if err = json.Unmarshal(confChange.Context, req); err != nil {
return
}
updated, err = mp.confRemoveNode(req, index)
case raftproto.ConfUpdateNode:
// updated, err = mp.confUpdateNode(req, index)
default:
// do nothing
}
if err != nil {
return
}
if updated {
mp.config.sortPeers()
if err = mp.persistMetadata(); err != nil {
log.LogErrorf("action[ApplyMemberChange] err[%v].", err)
return
}
}
return
}
// Snapshot returns the snapshot of the current meta partition.
func (mp *metaPartition) Snapshot() (snap raftproto.Snapshot, err error) {
snap, err = newMetaItemIterator(mp)
return
}
func (mp *metaPartition) ApplySnapshot(peers []raftproto.Peer, iter raftproto.SnapIterator) (err error) {
var (
data []byte
index int
appIndexID uint64
txID uint64
uniqID uint64
cursor uint64
inodeTree = NewBtree()
dentryTree = NewBtree()
extendTree = NewBtree()
multipartTree = NewBtree()
txTree = NewBtree()
txRbInodeTree = NewBtree()
txRbDentryTree = NewBtree()
uniqChecker = newUniqChecker()
verList []*proto.VolVersionInfo
)
blockUntilStoreSnapshot := func() {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
log.LogWarnf("ApplySnapshot: start to block until store snapshot to disk, mp[%v], appid %d", mp.config.PartitionId, appIndexID)
start := time.Now()
for {
select {
case <-ticker.C:
if time.Since(start) > time.Minute*20 {
msg := fmt.Sprintf("ApplySnapshot: wait store snapshot timeout after 20 minutes, mp %d, appId %d, storeId %d",
mp.config.PartitionId, appIndexID, mp.storedApplyId)
log.LogErrorf(msg)
err = fmt.Errorf(msg)
return
}
msg := fmt.Sprintf("ApplySnapshot: start check storedApplyId, mp %d appId %d, storeAppId %d, cost %s",
mp.config.PartitionId, appIndexID, mp.storedApplyId, time.Since(start).String())
if time.Since(start) > time.Minute {
log.LogWarnf("still block after one minute, msg %s", msg)
} else {
log.LogInfo(msg)
}
if mp.storedApplyId >= appIndexID {
log.LogWarnf("ApplySnapshot: store snapshot success, msg %s", msg)
return
}
case <-mp.stopC:
log.LogWarnf("ApplySnapshot: revice stop signal, exit now, partition(%d), applyId(%d)", mp.config.PartitionId, mp.applyID)
err = errors.New("server has been shutdown when block")
return
}
}
}
defer func() {
if err == io.EOF {
mp.applyID = appIndexID
mp.config.UniqId = uniqID
mp.txProcessor.txManager.txIdAlloc.setTransactionID(txID)
mp.inodeTree = inodeTree
mp.dentryTree = dentryTree
mp.extendTree = extendTree
mp.multipartTree = multipartTree
mp.config.Cursor = cursor
mp.txProcessor.txManager.txTree = txTree
mp.txProcessor.txResource.txRbInodeTree = txRbInodeTree
mp.txProcessor.txResource.txRbDentryTree = txRbDentryTree
mp.uniqChecker = uniqChecker
mp.multiVersionList.VerList = make([]*proto.VolVersionInfo, len(verList))
copy(mp.multiVersionList.VerList, verList)
mp.verSeq = mp.multiVersionList.GetLastVer()
log.LogInfof("mp[%v] updateVerList (%v) seq [%v]", mp.config.PartitionId, mp.multiVersionList.VerList, mp.verSeq)
err = nil
// store message
mp.storeChan <- &storeMsg{
command: opFSMStoreTick,
applyIndex: mp.applyID,
txId: mp.txProcessor.txManager.txIdAlloc.getTransactionID(),
inodeTree: mp.inodeTree.GetTree(),
dentryTree: mp.dentryTree.GetTree(),
extendTree: mp.extendTree.GetTree(),
multipartTree: mp.multipartTree.GetTree(),
txTree: mp.txProcessor.txManager.txTree.GetTree(),
txRbInodeTree: mp.txProcessor.txResource.txRbInodeTree.GetTree(),
txRbDentryTree: mp.txProcessor.txResource.txRbDentryTree.GetTree(),
uniqChecker: uniqChecker.clone(),
multiVerList: mp.GetVerList(),
}
select {
case mp.extReset <- struct{}{}:
log.LogDebugf("ApplySnapshot: finish with EOF: partitionID(%v) applyID(%v), txID(%v), uniqID(%v), cursor(%v)",
mp.config.PartitionId, mp.applyID, mp.txProcessor.txManager.txIdAlloc.getTransactionID(), mp.config.UniqId, mp.config.Cursor)
blockUntilStoreSnapshot()
return
case <-mp.stopC:
log.LogWarnf("ApplySnapshot: revice stop signal, exit now, partition(%d), applyId(%d)", mp.config.PartitionId, mp.applyID)
err = errors.New("server has been shutdown")
return
}
}
log.LogErrorf("ApplySnapshot: stop with error: partitionID(%v) err(%v)", mp.config.PartitionId, err)
}()
leaderSnapFormatVer := uint32(math.MaxUint32)
for {
data, err = iter.Next()
if err != nil {
return
}
if index == 0 {
appIndexID = binary.BigEndian.Uint64(data)
log.LogDebugf("ApplySnapshot: partitionID(%v), temporary uint64 appIndexID:%v", mp.config.PartitionId, appIndexID)
}
snap := NewMetaItem(0, nil, nil)
if err = snap.UnmarshalBinary(data); err != nil {
if index == 0 {
// for compatibility: if the leader sends a snapshot in format version_0, index=0 is the applyId as a raw uint64,
// which makes snap.UnmarshalBinary fail, so just skip index=0 and continue with the other fields
log.LogInfof("ApplySnapshot: snap.UnmarshalBinary failed in index=0, partitionID(%v), assuming snapshot format version_0",
mp.config.PartitionId)
index++
leaderSnapFormatVer = SnapFormatVersion_0
continue
}
log.LogInfof("ApplySnapshot: snap.UnmarshalBinary failed, partitionID(%v) index(%v)", mp.config.PartitionId, index)
err = errors.New("unmarshal snap data failed")
return
}
if index == 0 {
if snap.Op != opFSMSnapFormatVersion {
// check whether the snapshot format matches, if snap.UnmarshalBinary has no err for index 0, it should be opFSMSnapFormatVersion
err = fmt.Errorf("ApplySnapshot: snapshot format not match, partitionID(%v), index:%v, expect snap.Op:%v, actual snap.Op:%v",
mp.config.PartitionId, index, opFSMSnapFormatVersion, snap.Op)
log.LogWarn(err.Error())
return
}
// check whether the snapshot format version number matches
leaderSnapFormatVer = binary.BigEndian.Uint32(snap.V)
if leaderSnapFormatVer != mp.manager.metaNode.raftSyncSnapFormatVersion {
log.LogWarnf("ApplySnapshot: snapshot format not match, partitionID(%v), index:%v, expect ver:%v, actual ver:%v",
mp.config.PartitionId, index, mp.manager.metaNode.raftSyncSnapFormatVersion, leaderSnapFormatVer)
}
index++
continue
}
index++
switch snap.Op {
case opFSMApplyId:
appIndexID = binary.BigEndian.Uint64(snap.V)
log.LogDebugf("ApplySnapshot: partitionID(%v) appIndexID:%v", mp.config.PartitionId, appIndexID)
case opFSMTxId:
txID = binary.BigEndian.Uint64(snap.V)
log.LogDebugf("ApplySnapshot: partitionID(%v) txID:%v", mp.config.PartitionId, txID)
case opFSMCursor:
cursor = binary.BigEndian.Uint64(snap.V)
log.LogDebugf("ApplySnapshot: partitionID(%v) cursor:%v", mp.config.PartitionId, cursor)
case opFSMUniqIDSnap:
uniqID = binary.BigEndian.Uint64(snap.V)
log.LogDebugf("ApplySnapshot: partitionID(%v) uniqId:%v", mp.config.PartitionId, uniqID)
case opFSMCreateInode:
ino := NewInode(0, 0)
// TODO Unhandled errors
ino.UnmarshalKey(snap.K)
ino.UnmarshalValue(snap.V)
if cursor < ino.Inode {
cursor = ino.Inode
}
inodeTree.ReplaceOrInsert(ino, true)
log.LogDebugf("ApplySnapshot: create inode: partitonID(%v) inode[%v].", mp.config.PartitionId, ino)
case opFSMCreateDentry:
dentry := &Dentry{}
if err = dentry.UnmarshalKey(snap.K); err != nil {
return
}
if err = dentry.UnmarshalValue(snap.V); err != nil {
return
}
dentryTree.ReplaceOrInsert(dentry, true)
log.LogDebugf("ApplySnapshot: create dentry: partitionID(%v) dentry(%v)", mp.config.PartitionId, dentry)
case opFSMSetXAttr:
var extend *Extend
if extend, err = NewExtendFromBytes(snap.V); err != nil {
return
}
extendTree.ReplaceOrInsert(extend, true)
log.LogDebugf("ApplySnapshot: set extend attributes: partitionID(%v) extend(%v)",
mp.config.PartitionId, extend)
case opFSMCreateMultipart:
multipart := MultipartFromBytes(snap.V)
multipartTree.ReplaceOrInsert(multipart, true)
log.LogDebugf("ApplySnapshot: create multipart: partitionID(%v) multipart(%v)", mp.config.PartitionId, multipart)
case opFSMTxSnapshot:
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
txInfo.Unmarshal(snap.V)
txTree.ReplaceOrInsert(txInfo, true)
log.LogDebugf("ApplySnapshot: create transaction: partitionID(%v) txInfo(%v)", mp.config.PartitionId, txInfo)
case opFSMTxRbInodeSnapshot:
txRbInode := NewTxRollbackInode(nil, []uint32{}, nil, 0)
txRbInode.Unmarshal(snap.V)
txRbInodeTree.ReplaceOrInsert(txRbInode, true)
log.LogDebugf("ApplySnapshot: create txRbInode: partitionID(%v) txRbinode[%v]", mp.config.PartitionId, txRbInode)
case opFSMTxRbDentrySnapshot:
txRbDentry := NewTxRollbackDentry(nil, nil, 0)
txRbDentry.Unmarshal(snap.V)
txRbDentryTree.ReplaceOrInsert(txRbDentry, true)
log.LogDebugf("ApplySnapshot: create txRbDentry: partitionID(%v) txRbDentry(%v)", mp.config.PartitionId, txRbDentry)
case opFSMVerListSnapShot:
json.Unmarshal(snap.V, &verList)
log.LogDebugf("ApplySnapshot: create verList: partitionID(%v) snap.V(%v) verList(%v)", mp.config.PartitionId, snap.V, verList)
case opExtentFileSnapshot:
fileName := string(snap.K)
fileName = path.Join(mp.config.RootDir, fileName)
if err = os.WriteFile(fileName, snap.V, 0o644); err != nil {
log.LogErrorf("ApplySnapshot: write snap extent delete file fail: partitionID(%v) err(%v)",
mp.config.PartitionId, err)
}
log.LogDebugf("ApplySnapshot: write snap extent delete file: partitonID(%v) filename(%v).",
mp.config.PartitionId, fileName)
case opFSMUniqCheckerSnap:
if err = uniqChecker.UnMarshal(snap.V); err != nil {
log.LogErrorf("ApplyUniqChecker: write snap uniqChecker fail")
return
}
log.LogDebugf("ApplySnapshot: write snap uniqChecker")
default:
if leaderSnapFormatVer != math.MaxUint32 && leaderSnapFormatVer > mp.manager.metaNode.raftSyncSnapFormatVersion {
log.LogWarnf("ApplySnapshot: unknown op=%d, leaderSnapFormatVer:%v, mySnapFormatVer:%v, skip it",
snap.Op, leaderSnapFormatVer, mp.manager.metaNode.raftSyncSnapFormatVersion)
} else {
err = fmt.Errorf("unknown Op=%d", snap.Op)
return
}
}
}
}
// HandleFatalEvent handles the fatal errors.
func (mp *metaPartition) HandleFatalEvent(err *raft.FatalError) {
// Panic while fatal event happen.
exporter.Warning(fmt.Sprintf("action[HandleFatalEvent] err[%v].", err))
log.LogFatalf("action[HandleFatalEvent] err[%v].", err)
panic(err.Err)
}
// HandleLeaderChange handles the leader changes.
func (mp *metaPartition) HandleLeaderChange(leader uint64) {
exporter.Warning(fmt.Sprintf("metaPartition(%v) changeLeader to (%v)", mp.config.PartitionId, leader))
if mp.config.NodeId == leader {
localIp := mp.manager.metaNode.localAddr
if localIp == "" {
localIp = "127.0.0.1"
}
conn, err := net.DialTimeout("tcp", net.JoinHostPort(localIp, serverPort), time.Second)
if err != nil {
log.LogErrorf(fmt.Sprintf("HandleLeaderChange serverPort not exsit ,error %v", err))
exporter.Warning(fmt.Sprintf("mp[%v] HandleLeaderChange serverPort not exsit ,error %v", mp.config.PartitionId, err))
go mp.raftPartition.TryToLeader(mp.config.PartitionId)
return
}
log.LogDebugf("[metaPartition] HandleLeaderChange close conn %v, nodeId: %v, leader: %v", serverPort, mp.config.NodeId, leader)
exporter.Warning(fmt.Sprintf("[metaPartition]mp[%v] HandleLeaderChange close conn %v, nodeId: %v, leader: %v", mp.config.PartitionId, serverPort, mp.config.NodeId, leader))
conn.(*net.TCPConn).SetLinger(0)
conn.Close()
}
if mp.config.NodeId != leader {
log.LogDebugf("[metaPartition] pid: %v HandleLeaderChange become unleader nodeId: %v, leader: %v", mp.config.PartitionId, mp.config.NodeId, leader)
exporter.Warning(fmt.Sprintf("[metaPartition] pid: %v HandleLeaderChange become unleader nodeId: %v, leader: %v", mp.config.PartitionId, mp.config.NodeId, leader))
mp.storeChan <- &storeMsg{
command: stopStoreTick,
}
return
}
mp.storeChan <- &storeMsg{
command: startStoreTick,
}
log.LogDebugf("[metaPartition] pid: %v HandleLeaderChange become leader conn %v, nodeId: %v, leader: %v", mp.config.PartitionId, serverPort, mp.config.NodeId, leader)
exporter.Warning(fmt.Sprintf("[metaPartition] pid: %v HandleLeaderChange become leader conn %v, nodeId: %v, leader: %v", mp.config.PartitionId, serverPort, mp.config.NodeId, leader))
if mp.config.Start == 0 && mp.config.Cursor == 0 {
id, err := mp.nextInodeID()
if err != nil {
log.LogFatalf("[HandleLeaderChange] init root inode id: %s.", err.Error())
exporter.Warning(fmt.Sprintf("[HandleLeaderChange] pid %v init root inode id: %s.", mp.config.PartitionId, err.Error()))
}
ino := NewInode(id, proto.Mode(os.ModePerm|os.ModeDir))
go mp.initInode(ino)
}
}
// submit marshals the given operation (op code and request data) and submits it to the raft store.
func (mp *metaPartition) submit(op uint32, data []byte) (resp interface{}, err error) {
log.LogDebugf("submit. op [%v]", op)
snap := NewMetaItem(0, nil, nil)
snap.Op = op
if data != nil {
snap.V = data
}
cmd, err := snap.MarshalJson()
if err != nil {
return
}
// submit to the raft store
resp, err = mp.raftPartition.Submit(cmd)
log.LogDebugf("submit. op [%v] done", op)
return
}
func (mp *metaPartition) uploadApplyID(applyId uint64) {
atomic.StoreUint64(&mp.applyID, applyId)
}
func (mp *metaPartition) getApplyID() (applyId uint64) {
return atomic.LoadUint64(&mp.applyID)
}
func (mp *metaPartition) getCommittedID() (committedId uint64) {
status := mp.raftPartition.Status()
return status.Commit
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) initInode(ino *Inode) {
for {
time.Sleep(10 * time.Nanosecond)
select {
case <-mp.stopC:
return
default:
// check first root inode
if mp.hasInode(ino) {
return
}
if !mp.raftPartition.IsRaftLeader() {
continue
}
// qinode := &MetaQuotaInode{
// inode: ino,
// quotaIds: make([]uint32, 0, 0),
// }
// data, err := qinode.Marshal()
// if err != nil {
// log.LogFatalf("[initInode] marshal: %s", err.Error())
// }
data, err := ino.Marshal()
if err != nil {
log.LogFatalf("[initInode] marshal: %s", err.Error())
}
// put first root inode
resp, err := mp.submit(opFSMCreateInode, data)
if err != nil {
log.LogFatalf("[initInode] raft sync: %s", err.Error())
}
p := &Packet{}
p.ResultCode = resp.(uint8)
log.LogDebugf("[initInode] raft sync: response status = %v.",
p.GetResultMsg())
return
}
}
}
// Not implemented.
func (mp *metaPartition) decommissionPartition() (err error) {
return
}
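// fsmUpdatePartition updates the end of the partition's inode range. It returns
// OpAgain if the new end is smaller than the current cursor, and OpDiskErr if
// persisting the metadata fails; in both cases the old end is restored.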
func (mp *metaPartition) fsmUpdatePartition(end uint64) (status uint8,
err error) {
status = proto.OpOk
oldEnd := mp.config.End
mp.config.End = end
if end < mp.config.Cursor {
status = proto.OpAgain
mp.config.End = oldEnd
return
}
if err = mp.PersistMetadata(); err != nil {
status = proto.OpDiskErr
mp.config.End = oldEnd
}
return
}
func (mp *metaPartition) confAddNode(req *proto.AddMetaPartitionRaftMemberRequest, index uint64) (updated bool, err error) {
var (
heartbeatPort int
replicaPort int
)
if heartbeatPort, replicaPort, err = mp.getRaftPort(); err != nil {
return
}
addPeer := false
for _, peer := range mp.config.Peers {
if peer.ID == req.AddPeer.ID {
addPeer = true
break
}
}
updated = !addPeer
if !updated {
return
}
mp.config.Peers = append(mp.config.Peers, req.AddPeer)
addr := strings.Split(req.AddPeer.Addr, ":")[0]
mp.config.RaftStore.AddNodeWithPort(req.AddPeer.ID, addr, heartbeatPort, replicaPort)
return
}
func (mp *metaPartition) confRemoveNode(req *proto.RemoveMetaPartitionRaftMemberRequest, index uint64) (updated bool, err error) {
var canRemoveSelf bool
if canRemoveSelf, err = mp.canRemoveSelf(); err != nil {
return
}
peerIndex := -1
data, _ := json.Marshal(req)
log.LogInfof("Start RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, mp.config.NodeId, string(data))
for i, peer := range mp.config.Peers {
if peer.ID == req.RemovePeer.ID {
updated = true
peerIndex = i
break
}
}
if !updated {
log.LogInfof("NoUpdate RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, mp.config.NodeId, string(data))
return
}
mp.config.Peers = append(mp.config.Peers[:peerIndex], mp.config.Peers[peerIndex+1:]...)
if mp.config.NodeId == req.RemovePeer.ID && !mp.isLoadingMetaPartition && canRemoveSelf {
mp.Stop()
mp.DeleteRaft()
mp.manager.deletePartition(mp.GetBaseConfig().PartitionId)
os.RemoveAll(mp.config.RootDir)
updated = false
}
log.LogInfof("Fininsh RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, mp.config.NodeId, string(data))
return
}
func (mp *metaPartition) delOldExtentFile(buf []byte) (err error) {
fileName := string(buf)
log.LogWarnf("[delOldExtentFile] del extent file(%s), mp[%v]", fileName, mp.config.PartitionId)
infos, err := ioutil.ReadDir(mp.config.RootDir)
if err != nil {
return
}
infos = sortDelExtFileInfo(infos)
tgtIdx := getDelExtFileIdx(fileName)
for _, f := range infos {
idx := getDelExtFileIdx(f.Name())
if idx > tgtIdx {
break
}
log.LogWarnf("[delOldExtentFile] del extent file(%s), mp[%v]", f.Name(), mp.config.PartitionId)
os.Remove(path.Join(mp.config.RootDir, f.Name()))
}
return
}
func (mp *metaPartition) setExtentDeleteFileCursor(buf []byte) (err error) {
str := string(buf)
var (
fileName string
cursor int64
)
_, err = fmt.Sscanf(str, "%s %d", &fileName, &cursor)
log.LogInfof("[setExtentDeleteFileCursor] &fileName_&cursor(%s), mp[%v]", str, mp.config.PartitionId)
if err != nil {
return
}
fp, err := os.OpenFile(path.Join(mp.config.RootDir, fileName), os.O_CREATE|os.O_RDWR,
0o644)
if err != nil {
log.LogErrorf("[setExtentDeleteFileCursor] openFile %s failed: %s",
fileName, err.Error())
return
}
if err = binary.Write(fp, binary.BigEndian, cursor); err != nil {
log.LogErrorf("[setExtentDeleteFileCursor] write file %s cursor"+
" failed: %s", fileName, err.Error())
}
// TODO Unhandled errors
fp.Close()
return
}
func (mp *metaPartition) CanRemoveRaftMember(peer proto.Peer) error {
downReplicas := mp.config.RaftStore.RaftServer().GetDownReplicas(mp.config.PartitionId)
exists := false
for _, p := range mp.config.Peers {
if p.ID == peer.ID {
exists = true
break
}
}
if !exists {
return nil
}
hasDownReplicasExcludePeer := make([]uint64, 0)
for _, nodeID := range downReplicas {
if nodeID.NodeID == peer.ID {
continue
}
hasDownReplicasExcludePeer = append(hasDownReplicasExcludePeer, nodeID.NodeID)
}
sumReplicas := len(mp.config.Peers)
if sumReplicas%2 == 1 {
if sumReplicas-len(hasDownReplicasExcludePeer) > (sumReplicas/2 + 1) {
return nil
}
} else {
if sumReplicas-len(hasDownReplicasExcludePeer) >= (sumReplicas/2 + 1) {
return nil
}
}
return fmt.Errorf("downReplicas(%v) too much,so donnot offline (%v)", downReplicas, peer)
}
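// quorumAllowsRemoval is a minimal illustrative sketch (a hypothetical helper,
// not referenced by the code above) of the majority check performed in
// CanRemoveRaftMember: a peer may be removed only while the replicas not
// reported down (the peer itself excluded from the down list) still amount to
// a majority of the configured replica set, e.g. 3 of 3 for a 3-replica set
// and at least 3 of 4 for a 4-replica set.
func quorumAllowsRemoval(sumReplicas, downReplicasExcludingPeer int) bool {
alive := sumReplicas - downReplicasExcludingPeer
if sumReplicas%2 == 1 {
// odd-sized replica set: strictly more than half+1 must remain up
return alive > sumReplicas/2+1
}
// even-sized replica set: at least half+1 must remain up
return alive >= sumReplicas/2+1
}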
func (mp *metaPartition) IsEquareCreateMetaPartitionRequst(request *proto.CreateMetaPartitionRequest) (err error) {
if len(mp.config.Peers) != len(request.Members) {
return fmt.Errorf("Exsit unavali Partition(%v) partitionHosts(%v) requestHosts(%v)", mp.config.PartitionId, mp.config.Peers, request.Members)
}
if mp.config.Start != request.Start || mp.config.End != request.End {
return fmt.Errorf("Exsit unavali Partition(%v) range(%v-%v) requestRange(%v-%v)", mp.config.PartitionId, mp.config.Start, mp.config.End, request.Start, request.End)
}
for index, peer := range mp.config.Peers {
requestPeer := request.Members[index]
if requestPeer.ID != peer.ID || requestPeer.Addr != peer.Addr {
return fmt.Errorf("Exsit unavali Partition(%v) partitionHosts(%v) requestHosts(%v)", mp.config.PartitionId, mp.config.Peers, request.Members)
}
}
if mp.config.VolName != request.VolName {
return fmt.Errorf("Exsit unavali Partition(%v) VolName(%v) requestVolName(%v)", mp.config.PartitionId, mp.config.VolName, request.VolName)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"strings"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
type DentryResponse struct {
Status uint8
Msg *Dentry
}
func NewDentryResponse() *DentryResponse {
return &DentryResponse{
Msg: &Dentry{},
}
}
func (mp *metaPartition) fsmTxCreateDentry(txDentry *TxDentry) (status uint8) {
done := mp.txProcessor.txManager.txInRMDone(txDentry.TxInfo.TxID)
if done {
log.LogWarnf("fsmTxCreateDentry: tx is already finish. txId %s", txDentry.TxInfo.TxID)
status = proto.OpTxInfoNotExistErr
return
}
txDI := proto.NewTxDentryInfo("", txDentry.Dentry.ParentId, txDentry.Dentry.Name, 0)
txDenInfo, ok := txDentry.TxInfo.TxDentryInfos[txDI.GetKey()]
if !ok {
status = proto.OpTxDentryInfoNotExistErr
return
}
rbDentry := NewTxRollbackDentry(txDentry.Dentry, txDenInfo, TxDelete)
status = mp.txProcessor.txResource.addTxRollbackDentry(rbDentry)
if status == proto.OpExistErr {
return proto.OpOk
}
if status != proto.OpOk {
return
}
defer func() {
if status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackDentry(txDenInfo.ParentId, txDenInfo.Name, txDenInfo.TxID)
}
}()
return mp.fsmCreateDentry(txDentry.Dentry, false)
}
// Insert a dentry into the dentry tree.
func (mp *metaPartition) fsmCreateDentry(dentry *Dentry,
forceUpdate bool) (status uint8) {
status = proto.OpOk
var parIno *Inode
if !forceUpdate {
item := mp.inodeTree.CopyGet(NewInode(dentry.ParentId, 0))
if item == nil {
log.LogErrorf("action[fsmCreateDentry] mp[%v] ParentId [%v] get nil, dentry name [%v], inode[%v]", mp.config.PartitionId, dentry.ParentId, dentry.Name, dentry.Inode)
status = proto.OpNotExistErr
return
}
parIno = item.(*Inode)
if parIno.ShouldDelete() {
log.LogErrorf("action[fsmCreateDentry] mp[%v] ParentId [%v] get [%v] but should del, dentry name [%v], inode[%v]", mp.config.PartitionId, dentry.ParentId, parIno, dentry.Name, dentry.Inode)
status = proto.OpNotExistErr
return
}
if !proto.IsDir(parIno.Type) {
log.LogErrorf("action[fsmCreateDentry] mp[%v] ParentId [%v] get [%v] but should del, dentry name [%v], inode[%v]", mp.config.PartitionId, dentry.ParentId, parIno, dentry.Name, dentry.Inode)
status = proto.OpArgMismatchErr
return
}
}
if item, ok := mp.dentryTree.ReplaceOrInsert(dentry, false); !ok {
// do not allow directories and files to overwrite each
// other when renaming
d := item.(*Dentry)
if d.isDeleted() {
log.LogDebugf("action[fsmCreateDentry] mp[%v] newest dentry %v be set deleted flag", mp.config.PartitionId, d)
d.Inode = dentry.Inode
if d.getVerSeq() == dentry.getVerSeq() {
d.setVerSeq(dentry.getSeqFiled())
} else {
if d.getSnapListLen() > 0 && d.multiSnap.dentryList[0].isDeleted() {
d.setVerSeq(dentry.getSeqFiled())
} else {
d.addVersion(dentry.getSeqFiled())
}
}
d.Type = dentry.Type
d.ParentId = dentry.ParentId
log.LogDebugf("action[fsmCreateDentry.ver] mp[%v] latest dentry already deleted.Now create new one [%v]", mp.config.PartitionId, dentry)
if !forceUpdate {
parIno.IncNLink(mp.verSeq)
parIno.SetMtime()
}
return
} else if proto.OsModeType(dentry.Type) != proto.OsModeType(d.Type) && !proto.IsSymlink(dentry.Type) && !proto.IsSymlink(d.Type) {
log.LogErrorf("action[fsmCreateDentry] ParentId [%v] get [%v] but should del, dentry name [%v], inode[%v], type[%v,%v],dir[%v,%v]",
dentry.ParentId, parIno, dentry.Name, dentry.Inode, dentry.Type, d.Type, proto.IsSymlink(dentry.Type), proto.IsSymlink(d.Type))
status = proto.OpArgMismatchErr
return
} else if dentry.ParentId == d.ParentId && strings.Compare(dentry.Name, d.Name) == 0 && dentry.Inode == d.Inode {
log.LogDebugf("action[fsmCreateDentry.ver] mp[%v] no need repeat create new one [%v]", mp.config.PartitionId, dentry)
return
}
log.LogErrorf("action[fsmCreateDentry.ver] mp[%v] dentry already exist [%v] and diff with the request [%v]", mp.config.PartitionId, d, dentry)
status = proto.OpExistErr
return
}
if !forceUpdate {
parIno.IncNLink(mp.verSeq)
parIno.SetMtime()
}
return
}
func (mp *metaPartition) getDentryList(dentry *Dentry) (denList []proto.DetryInfo) {
item := mp.dentryTree.Get(dentry)
if item != nil {
if item.(*Dentry).getSnapListLen() == 0 {
return
}
for _, den := range item.(*Dentry).multiSnap.dentryList {
denList = append(denList, proto.DetryInfo{
Inode: den.Inode,
Mode: den.Type,
IsDel: den.isDeleted(),
VerSeq: den.getVerSeq(),
})
}
}
return
}
// Query a dentry from the dentry tree with specified dentry info.
func (mp *metaPartition) getDentry(dentry *Dentry) (*Dentry, uint8) {
status := proto.OpOk
item := mp.dentryTree.Get(dentry)
if item == nil {
status = proto.OpNotExistErr
return nil, status
}
log.LogDebugf("action[getDentry] get dentry[%v] by req dentry %v", item.(*Dentry), dentry)
den := mp.getDentryByVerSeq(item.(*Dentry), dentry.getSeqFiled())
if den != nil {
return den, proto.OpOk
}
return den, proto.OpNotExistErr
}
func (mp *metaPartition) fsmTxDeleteDentry(txDentry *TxDentry) (resp *DentryResponse) {
resp = NewDentryResponse()
resp.Status = proto.OpOk
if mp.txProcessor.txManager.txInRMDone(txDentry.TxInfo.TxID) {
log.LogWarnf("fsmTxDeleteDentry: tx is already finish. txId %s", txDentry.TxInfo.TxID)
resp.Status = proto.OpTxInfoNotExistErr
return
}
tmpDen := txDentry.Dentry
txDI := proto.NewTxDentryInfo("", tmpDen.ParentId, tmpDen.Name, 0)
txDenInfo, ok := txDentry.TxInfo.TxDentryInfos[txDI.GetKey()]
if !ok {
resp.Status = proto.OpTxDentryInfoNotExistErr
return
}
rbDentry := NewTxRollbackDentry(tmpDen, txDenInfo, TxAdd)
resp.Status = mp.txProcessor.txResource.addTxRollbackDentry(rbDentry)
if resp.Status == proto.OpExistErr {
resp.Status = proto.OpOk
return
}
if resp.Status != proto.OpOk {
return
}
defer func() {
if resp.Status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackDentry(txDenInfo.ParentId, txDenInfo.Name, txDenInfo.TxID)
}
}()
item := mp.dentryTree.Get(tmpDen)
if item == nil || item.(*Dentry).Inode != tmpDen.Inode {
log.LogWarnf("fsmTxDeleteDentry: got wrong dentry, want %v, got %v", tmpDen, item)
resp.Status = proto.OpNotExistErr
return
}
mp.dentryTree.Delete(tmpDen)
// the parent link count does not change
resp.Msg = item.(*Dentry)
return
}
// Delete dentry from the dentry tree.
func (mp *metaPartition) fsmDeleteDentry(denParm *Dentry, checkInode bool) (resp *DentryResponse) {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] delete param (%v) seq [%v]", mp.config.PartitionId, denParm, denParm.getSeqFiled())
resp = NewDentryResponse()
resp.Status = proto.OpOk
var (
denFound *Dentry
item interface{}
doMore = true
clean bool
)
if checkInode {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] delete param %v", mp.config.PartitionId, denParm)
item = mp.dentryTree.Execute(func(tree *btree.BTree) interface{} {
d := tree.CopyGet(denParm)
if d == nil {
return nil
}
den := d.(*Dentry)
if den.Inode != denParm.Inode {
return nil
}
if mp.verSeq == 0 {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] volume snapshot not enabled,delete directly", mp.config.PartitionId)
denFound = den
return mp.dentryTree.tree.Delete(den)
}
denFound, doMore, clean = den.deleteVerSnapshot(denParm.getSeqFiled(), mp.verSeq, mp.GetVerList())
return den
})
} else {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] denParm dentry %v", mp.config.PartitionId, denParm)
if mp.verSeq == 0 {
item = mp.dentryTree.Delete(denParm)
if item != nil {
denFound = item.(*Dentry)
}
} else {
item = mp.dentryTree.Get(denParm)
if item != nil {
denFound, doMore, clean = item.(*Dentry).deleteVerSnapshot(denParm.getSeqFiled(), mp.verSeq, mp.GetVerList())
}
}
}
if item != nil && (clean == true || (item.(*Dentry).getSnapListLen() == 0 && item.(*Dentry).isDeleted())) {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] dnetry %v really be deleted", mp.config.PartitionId, item.(*Dentry))
item = mp.dentryTree.Delete(item.(*Dentry))
}
if !doMore { // not the top layer, do nothing to the parent inode
if denFound != nil {
resp.Msg = denFound
}
log.LogDebugf("action[fsmDeleteDentry] mp[%v] there's nothing to do more denParm %v", mp.config.PartitionId, denParm)
return
}
if denFound == nil {
resp.Status = proto.OpNotExistErr
log.LogErrorf("action[fsmDeleteDentry] mp[%v] not found dentry %v", mp.config.PartitionId, denParm)
return
} else {
mp.inodeTree.CopyFind(NewInode(denParm.ParentId, 0),
func(item BtreeItem) {
if item != nil { // the parent inode may already be gone; skip if so
ino := item.(*Inode)
if !ino.ShouldDelete() {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] den %v delete parent's link", mp.config.PartitionId, denParm)
if denParm.getSeqFiled() == 0 {
item.(*Inode).DecNLink()
}
log.LogDebugf("action[fsmDeleteDentry] mp[%v] inode[%v] be unlinked by child name %v", mp.config.PartitionId, item.(*Inode).Inode, denParm.Name)
item.(*Inode).SetMtime()
}
}
})
}
resp.Msg = denFound
return
}
// batch Delete dentry from the dentry tree.
func (mp *metaPartition) fsmBatchDeleteDentry(db DentryBatch) []*DentryResponse {
result := make([]*DentryResponse, 0, len(db))
for _, dentry := range db {
status := mp.dentryInTx(dentry.ParentId, dentry.Name)
if status != proto.OpOk {
result = append(result, &DentryResponse{Status: status})
continue
}
result = append(result, mp.fsmDeleteDentry(dentry, true))
}
return result
}
func (mp *metaPartition) fsmTxUpdateDentry(txUpDateDentry *TxUpdateDentry) (resp *DentryResponse) {
resp = NewDentryResponse()
resp.Status = proto.OpOk
if mp.txProcessor.txManager.txInRMDone(txUpDateDentry.TxInfo.TxID) {
log.LogWarnf("fsmTxUpdateDentry: tx is already finish. txId %s", txUpDateDentry.TxInfo.TxID)
resp.Status = proto.OpTxInfoNotExistErr
return
}
newDen := txUpDateDentry.NewDentry
oldDen := txUpDateDentry.OldDentry
txDI := proto.NewTxDentryInfo("", oldDen.ParentId, oldDen.Name, 0)
txDenInfo, ok := txUpDateDentry.TxInfo.TxDentryInfos[txDI.GetKey()]
if !ok {
resp.Status = proto.OpTxDentryInfoNotExistErr
return
}
item := mp.dentryTree.CopyGet(oldDen)
if item == nil || item.(*Dentry).Inode != oldDen.Inode {
resp.Status = proto.OpNotExistErr
log.LogWarnf("fsmTxUpdateDentry: find dentry is not right, want %v, got %v", oldDen, item)
return
}
rbDentry := NewTxRollbackDentry(txUpDateDentry.OldDentry, txDenInfo, TxUpdate)
resp.Status = mp.txProcessor.txResource.addTxRollbackDentry(rbDentry)
if resp.Status == proto.OpExistErr {
resp.Status = proto.OpOk
return
}
if resp.Status != proto.OpOk {
return
}
d := item.(*Dentry)
d.Inode, newDen.Inode = newDen.Inode, d.Inode
resp.Msg = newDen
return
}
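// fsmUpdateDentry swaps the inode referenced by an existing dentry with the one
// carried in the request (used by rename). If the stored dentry belongs to an
// older version than the partition's current one, a copy is pushed onto its
// snapshot list before the swap.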
func (mp *metaPartition) fsmUpdateDentry(dentry *Dentry) (
resp *DentryResponse) {
resp = NewDentryResponse()
resp.Status = proto.OpOk
mp.dentryTree.CopyFind(dentry, func(item BtreeItem) {
if item == nil {
resp.Status = proto.OpNotExistErr
return
}
d := item.(*Dentry)
if dentry.Inode == d.Inode {
return
}
if d.getVerSeq() < mp.GetVerSeq() {
dn := d.CopyDirectly()
dn.(*Dentry).setVerSeq(d.getVerSeq())
d.setVerSeq(mp.GetVerSeq())
d.multiSnap.dentryList = append([]*Dentry{dn.(*Dentry)}, d.multiSnap.dentryList...)
}
d.Inode, dentry.Inode = dentry.Inode, d.Inode
resp.Msg = dentry
})
return
}
func (mp *metaPartition) getDentryTree() *BTree {
return mp.dentryTree.GetTree()
}
func (mp *metaPartition) getDentryByVerSeq(dy *Dentry, verSeq uint64) (d *Dentry) {
d, _ = dy.getDentryFromVerList(verSeq, false)
return
}
func (mp *metaPartition) readDirOnly(req *ReadDirOnlyReq) (resp *ReadDirOnlyResp) {
resp = &ReadDirOnlyResp{}
begDentry := &Dentry{
ParentId: req.ParentID,
}
endDentry := &Dentry{
ParentId: req.ParentID + 1,
}
mp.dentryTree.AscendRange(begDentry, endDentry, func(i BtreeItem) bool {
if proto.IsDir(i.(*Dentry).Type) {
d := mp.getDentryByVerSeq(i.(*Dentry), req.VerSeq)
if d == nil {
return true
}
resp.Children = append(resp.Children, proto.Dentry{
Inode: d.Inode,
Type: d.Type,
Name: d.Name,
})
}
return true
})
return
}
func (mp *metaPartition) readDir(req *ReadDirReq) (resp *ReadDirResp) {
resp = &ReadDirResp{}
begDentry := &Dentry{
ParentId: req.ParentID,
}
endDentry := &Dentry{
ParentId: req.ParentID + 1,
}
mp.dentryTree.AscendRange(begDentry, endDentry, func(i BtreeItem) bool {
d := mp.getDentryByVerSeq(i.(*Dentry), req.VerSeq)
if d == nil {
return true
}
resp.Children = append(resp.Children, proto.Dentry{
Inode: d.Inode,
Type: d.Type,
Name: d.Name,
})
return true
})
return
}
// Read dentries from the btree, limited by count.
// If req.Marker == "" and req.Limit == 0, it behaves like readDir.
// If req.Marker != "" and req.Limit == 0, it returns the dentries from pid:marker to pid+1.
// If req.Marker == "" and req.Limit != 0, it returns at most limit dentries of pid.
// If req.Marker != "" and req.Limit != 0, it returns at most limit dentries starting from pid:marker.
// An illustrative paging sketch follows the function below.
func (mp *metaPartition) readDirLimit(req *ReadDirLimitReq) (resp *ReadDirLimitResp) {
log.LogDebugf("action[readDirLimit] mp[%v] req %v", mp.config.PartitionId, req)
resp = &ReadDirLimitResp{}
startDentry := &Dentry{
ParentId: req.ParentID,
}
if len(req.Marker) > 0 {
startDentry.Name = req.Marker
}
endDentry := &Dentry{
ParentId: req.ParentID + 1,
}
mp.dentryTree.AscendRange(startDentry, endDentry, func(i BtreeItem) bool {
if !proto.IsDir(i.(*Dentry).Type) && (req.VerOpt&uint8(proto.FlagsSnapshotDel) > 0) {
if req.VerOpt&uint8(proto.FlagsSnapshotDelDir) > 0 {
return true
}
if !i.(*Dentry).isEffective(req.VerSeq) {
return true
}
}
d := mp.getDentryByVerSeq(i.(*Dentry), req.VerSeq)
if d == nil {
return true
}
resp.Children = append(resp.Children, proto.Dentry{
Inode: d.Inode,
Type: d.Type,
Name: d.Name,
})
// Limit == 0 means no limit.
if req.Limit > 0 && uint64(len(resp.Children)) >= req.Limit {
return false
}
return true
})
log.LogDebugf("action[readDirLimit] mp[%v] resp %v", mp.config.PartitionId, resp)
return
}
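// readDirPagedSketch is an illustrative, hypothetical helper (not part of the
// original code) showing how the Marker/Limit semantics documented above can be
// used to walk a large directory page by page: resume each call from the last
// name returned. It assumes the range scan includes the marker entry itself, so
// pages after the first drop the leading entry that was already returned.
func (mp *metaPartition) readDirPagedSketch(parentID, verSeq, pageSize uint64) (all []proto.Dentry) {
marker := ""
for {
resp := mp.readDirLimit(&ReadDirLimitReq{
ParentID: parentID,
Marker:   marker,
Limit:    pageSize,
VerSeq:   verSeq,
})
children := resp.Children
if marker != "" && len(children) > 0 && children[0].Name == marker {
// the marker entry was already returned on the previous page
children = children[1:]
}
if len(children) == 0 {
return
}
all = append(all, children...)
if pageSize == 0 || uint64(len(resp.Children)) < pageSize {
// a short (or unlimited) page means the directory is exhausted
return
}
marker = children[len(children)-1].Name
}
}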
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"math"
"github.com/cubefs/cubefs/util/log"
)
type ExtendOpResult struct {
Status uint8
Extend *Extend
}
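// fsmSetXAttr inserts the extended attributes, or merges them into the existing
// entry. If the partition's version has advanced past the stored entry, the
// previous attributes are first snapshotted into the entry's multiVers list.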
func (mp *metaPartition) fsmSetXAttr(extend *Extend) (err error) {
extend.verSeq = mp.GetVerSeq()
treeItem := mp.extendTree.CopyGet(extend)
var e *Extend
if treeItem == nil {
mp.extendTree.ReplaceOrInsert(extend, true)
} else {
// attr multi-version: copy all attributes to simplify management
e = treeItem.(*Extend)
if e.verSeq != extend.verSeq {
if extend.verSeq < e.verSeq {
return fmt.Errorf("seq error assign %v but less than %v", extend.verSeq, e.verSeq)
}
e.multiVers = append([]*Extend{e.Copy().(*Extend)}, e.multiVers...)
e.verSeq = extend.verSeq
}
e.Merge(extend, true)
}
return
}
// TODO(leon chang): check snapshot delete relation with attr
func (mp *metaPartition) fsmRemoveXAttr(reqExtend *Extend) (err error) {
treeItem := mp.extendTree.CopyGet(reqExtend)
if treeItem == nil {
return
}
e := treeItem.(*Extend)
if mp.GetVerSeq() == 0 || (e.verSeq == mp.GetVerSeq() && reqExtend.verSeq == 0) {
reqExtend.Range(func(key, value []byte) bool {
e.Remove(key)
return true
})
return
}
if reqExtend.verSeq == 0 {
reqExtend.verSeq = mp.GetVerSeq()
}
if reqExtend.verSeq == math.MaxUint64 {
reqExtend.verSeq = 0
}
e.versionMu.Lock()
defer e.versionMu.Unlock()
if reqExtend.verSeq < e.GetMinVer() {
return
}
mp.multiVersionList.RWLock.RLock()
defer mp.multiVersionList.RWLock.RUnlock()
if reqExtend.verSeq > e.verSeq {
e.multiVers = append([]*Extend{e.Copy().(*Extend)}, e.multiVers...)
e.verSeq = reqExtend.verSeq
reqExtend.Range(func(key, value []byte) bool {
e.Remove(key)
return true
})
} else if reqExtend.verSeq == e.verSeq {
var globalNewVer uint64
if globalNewVer, err = mp.multiVersionList.GetNextNewerVer(reqExtend.verSeq); err != nil {
log.LogErrorf("fsmRemoveXAttr. mp[%v] seq [%v] req ver [%v] not found newer seq", mp.config.PartitionId, mp.verSeq, reqExtend.verSeq)
return err
}
e.verSeq = globalNewVer
} else {
innerLastVer := e.verSeq
for id, ele := range e.multiVers {
if ele.verSeq > reqExtend.verSeq {
innerLastVer = ele.verSeq
continue
} else if ele.verSeq < reqExtend.verSeq {
return
} else {
var globalNewVer uint64
if globalNewVer, err = mp.multiVersionList.GetNextNewerVer(ele.verSeq); err != nil {
return err
}
if globalNewVer < innerLastVer {
log.LogDebugf("mp[%v] inode[%v] extent layer %v update seq [%v] to %v",
mp.config.PartitionId, ele.inode, id, ele.verSeq, globalNewVer)
ele.verSeq = globalNewVer
return
}
e.multiVers = append(e.multiVers[:id], e.multiVers[id+1:]...)
return
}
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
type InodeResponse struct {
Status uint8
Msg *Inode
}
func NewInodeResponse() *InodeResponse {
return &InodeResponse{}
}
// Create an inode and attach it to the inode tree.
func (mp *metaPartition) fsmTxCreateInode(txIno *TxInode, quotaIds []uint32) (status uint8) {
status = proto.OpOk
if mp.txProcessor.txManager.txInRMDone(txIno.TxInfo.TxID) {
log.LogWarnf("fsmTxCreateInode: tx is already finish. txId %s", txIno.TxInfo.TxID)
return proto.OpTxInfoNotExistErr
}
// inodeInfo := mp.txProcessor.txManager.getTxInodeInfo(txIno.TxInfo.TxID, txIno.Inode.Inode)
inodeInfo, ok := txIno.TxInfo.TxInodeInfos[txIno.Inode.Inode]
if !ok {
status = proto.OpTxInodeInfoNotExistErr
return
}
rbInode := NewTxRollbackInode(txIno.Inode, quotaIds, inodeInfo, TxDelete)
status = mp.txProcessor.txResource.addTxRollbackInode(rbInode)
if status != proto.OpOk {
return
}
defer func() {
if status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackInode(txIno.Inode.Inode, txIno.TxInfo.TxID)
}
}()
// 3.insert inode in inode tree
return mp.fsmCreateInode(txIno.Inode)
}
// Create an inode and attach it to the inode tree.
func (mp *metaPartition) fsmCreateInode(ino *Inode) (status uint8) {
if status = mp.uidManager.addUidSpace(ino.Uid, ino.Inode, nil); status != proto.OpOk {
return
}
status = proto.OpOk
if _, ok := mp.inodeTree.ReplaceOrInsert(ino, false); !ok {
status = proto.OpExistErr
}
return
}
func (mp *metaPartition) fsmTxCreateLinkInode(txIno *TxInode) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
if mp.txProcessor.txManager.txInRMDone(txIno.TxInfo.TxID) {
log.LogWarnf("fsmTxCreateLinkInode: tx is already finish. txId %s", txIno.TxInfo.TxID)
resp.Status = proto.OpTxInfoNotExistErr
return
}
// 2.register rollback item
inodeInfo, ok := txIno.TxInfo.TxInodeInfos[txIno.Inode.Inode]
if !ok {
resp.Status = proto.OpTxInodeInfoNotExistErr
return
}
rbInode := NewTxRollbackInode(txIno.Inode, []uint32{}, inodeInfo, TxDelete)
resp.Status = mp.txProcessor.txResource.addTxRollbackInode(rbInode)
if resp.Status == proto.OpExistErr {
resp.Status = proto.OpOk
resp.Msg = txIno.Inode
return
}
if resp.Status != proto.OpOk {
return
}
defer func() {
if resp.Status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackInode(txIno.Inode.Inode, txIno.TxInfo.TxID)
}
}()
return mp.fsmCreateLinkInode(txIno.Inode, 0)
}
func (mp *metaPartition) fsmCreateLinkInode(ino *Inode, uniqID uint64) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
resp.Status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
resp.Status = proto.OpNotExistErr
return
}
resp.Msg = i
if !mp.uniqChecker.legalIn(uniqID) {
log.LogWarnf("fsmCreateLinkInode repeated, ino[%v] uniqID %v nlink %v", ino.Inode, uniqID, ino.GetNLink())
return
}
i.IncNLink(ino.getVer())
return
}
func (mp *metaPartition) getInodeByVer(ino *Inode) (i *Inode) {
item := mp.inodeTree.Get(ino)
if item == nil {
log.LogDebugf("action[getInodeByVer] not found ino[%v] verseq [%v]", ino.Inode, ino.getVer())
return
}
i, _ = item.(*Inode).getInoByVer(ino.getVer(), false)
return
}
func (mp *metaPartition) getInodeTopLayer(ino *Inode) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
item := mp.inodeTree.Get(ino)
if item == nil {
resp.Status = proto.OpNotExistErr
log.LogDebugf("action[getInodeTopLayer] not found ino[%v] verseq [%v]", ino.Inode, ino.getVer())
return
}
i := item.(*Inode)
ctime := timeutil.GetCurrentTimeUnix()
/*
* FIXME: not protected by lock yet, since nothing is depending on atime.
* Shall add inode lock in the future.
*/
if ctime > i.AccessTime {
i.AccessTime = ctime
}
resp.Msg = i
return
}
func (mp *metaPartition) getInode(ino *Inode, listAll bool) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
i := mp.getInodeByVer(ino)
if i == nil || (!listAll && i.ShouldDelete()) {
log.LogDebugf("action[getInode] ino %v not found", ino)
resp.Status = proto.OpNotExistErr
return
}
ctime := timeutil.GetCurrentTimeUnix()
/*
* FIXME: not protected by lock yet, since nothing is depending on atime.
* Shall add inode lock in the future.
*/
if ctime > i.AccessTime {
i.AccessTime = ctime
}
resp.Msg = i
return
}
func (mp *metaPartition) hasInode(ino *Inode) (ok bool) {
item := mp.inodeTree.Get(ino)
if item == nil {
return
}
i := mp.getInodeByVer(ino)
if i == nil || i.ShouldDelete() {
return
}
ok = true
return
}
// Ascend is the wrapper of inodeTree.Ascend
func (mp *metaPartition) Ascend(f func(i BtreeItem) bool) {
mp.inodeTree.Ascend(f)
}
func (mp *metaPartition) fsmTxUnlinkInode(txIno *TxInode) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
if proto.IsDir(txIno.Inode.Type) && txIno.TxInfo.TxType == proto.TxTypeRemove && txIno.Inode.NLink > 2 {
resp.Status = proto.OpNotEmpty
log.LogWarnf("fsmTxUnlinkInode: dir is not empty, can't remove it, txinode[%v]", txIno)
return
}
if mp.txProcessor.txManager.txInRMDone(txIno.TxInfo.TxID) {
log.LogWarnf("fsmTxUnlinkInode: tx is already finish. txId %s", txIno.TxInfo.TxID)
resp.Status = proto.OpTxInfoNotExistErr
return
}
inodeInfo, ok := txIno.TxInfo.TxInodeInfos[txIno.Inode.Inode]
if !ok {
resp.Status = proto.OpTxInodeInfoNotExistErr
return
}
var quotaIds []uint32
quotaIds, _ = mp.isExistQuota(txIno.Inode.Inode)
rbInode := NewTxRollbackInode(txIno.Inode, quotaIds, inodeInfo, TxAdd)
resp.Status = mp.txProcessor.txResource.addTxRollbackInode(rbInode)
if resp.Status == proto.OpExistErr {
resp.Status = proto.OpOk
item := mp.inodeTree.Get(txIno.Inode)
if item != nil {
resp.Msg = item.(*Inode)
}
return
}
if resp.Status != proto.OpOk {
return
}
defer func() {
if resp.Status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackInode(txIno.Inode.Inode, txIno.TxInfo.TxID)
}
}()
resp = mp.fsmUnlinkInode(txIno.Inode, 0)
if resp.Status != proto.OpOk {
return
}
if txIno.TxInfo.TxType == proto.TxTypeRename {
mp.fsmEvictInode(txIno.Inode)
}
return
}
// A normal unlink uses seq 0; a snapshot unlink uses the snapshot version.
// fsmUnlinkInode deletes the specified inode from the inode tree.
func (mp *metaPartition) fsmUnlinkInode(ino *Inode, uniqID uint64) (resp *InodeResponse) {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v]", mp.config.PartitionId, ino)
var ext2Del []proto.ExtentKey
resp = NewInodeResponse()
resp.Status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v]", mp.config.PartitionId, ino)
resp.Status = proto.OpNotExistErr
return
}
inode := item.(*Inode)
if ino.getVer() == 0 && inode.ShouldDelete() {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v]", mp.config.PartitionId, ino)
resp.Status = proto.OpNotExistErr
return
}
resp.Msg = inode
if !mp.uniqChecker.legalIn(uniqID) {
log.LogWarnf("fsmUnlinkInode repeat, mp[%v] ino[%v] uniqID %v nlink %v", mp.config.PartitionId, ino.Inode, uniqID, ino.GetNLink())
return
}
log.LogDebugf("action[fsmUnlinkInode] mp[%v] get inode[%v]", mp.config.PartitionId, inode)
var (
doMore bool
status = proto.OpOk
)
if ino.getVer() == 0 {
ext2Del, doMore, status = inode.unlinkTopLayer(mp.config.PartitionId, ino, mp.verSeq, mp.multiVersionList)
} else { // means drop snapshot
log.LogDebugf("action[fsmUnlinkInode] mp[%v] req drop assigned snapshot reqseq [%v] inode seq [%v]", mp.config.PartitionId, ino.getVer(), inode.getVer())
if ino.getVer() > inode.getVer() && !isInitSnapVer(ino.getVer()) {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] inode[%v] unlink not exist snapshot and return do nothing.reqseq [%v] larger than inode seq [%v]",
mp.config.PartitionId, ino.Inode, ino.getVer(), inode.getVer())
return
} else {
ext2Del, doMore, status = inode.unlinkVerInList(mp.config.PartitionId, ino, mp.verSeq, mp.multiVersionList)
}
}
if !doMore {
resp.Status = status
return
}
if inode.IsEmptyDirAndNoSnapshot() {
if ino.NLink < 2 { // snapshot deletion
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v] really be deleted, empty dir", mp.config.PartitionId, inode)
mp.inodeTree.Delete(inode)
mp.updateUsedInfo(0, -1, inode.Inode)
}
} else if inode.IsTempFile() {
// all snapshots between creation and the last deletion have been cleaned
if inode.NLink == 0 && inode.getLayerLen() == 0 {
mp.updateUsedInfo(-1*int64(inode.Size), -1, inode.Inode)
log.LogDebugf("action[fsmUnlinkInode] mp[%v] unlink inode[%v] and push to freeList", mp.config.PartitionId, inode)
inode.AccessTime = time.Now().Unix()
mp.freeList.Push(inode.Inode)
mp.uidManager.doMinusUidSpace(inode.Uid, inode.Inode, inode.Size)
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v]", mp.config.PartitionId, inode)
}
}
if len(ext2Del) > 0 {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v] DecSplitExts ext2Del %v", mp.config.PartitionId, ino, ext2Del)
inode.DecSplitExts(mp.config.PartitionId, ext2Del)
mp.extDelCh <- ext2Del
}
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v] left", mp.config.PartitionId, inode)
return
}
// fsmUnlinkInodeBatch deletes the specified inodes from the inode tree.
func (mp *metaPartition) fsmUnlinkInodeBatch(ib InodeBatch) (resp []*InodeResponse) {
for _, ino := range ib {
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = append(resp, &InodeResponse{Status: status})
continue
}
resp = append(resp, mp.fsmUnlinkInode(ino, 0))
}
return
}
func (mp *metaPartition) internalHasInode(ino *Inode) bool {
return mp.inodeTree.Has(ino)
}
func (mp *metaPartition) internalDelete(val []byte) (err error) {
if len(val) == 0 {
return
}
buf := bytes.NewBuffer(val)
ino := NewInode(0, 0)
for {
err = binary.Read(buf, binary.BigEndian, &ino.Inode)
if err != nil {
if err == io.EOF {
err = nil
return
}
return
}
log.LogDebugf("internalDelete: received internal delete: partitionID(%v) inode[%v]",
mp.config.PartitionId, ino.Inode)
mp.internalDeleteInode(ino)
}
}
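// Illustrative sketch (not part of the original source; the helper name is hypothetical):
// internalDelete above reads `val` as a packed sequence of big-endian uint64 inode IDs
// until EOF, so a matching payload could be built on the caller side like this.
func exampleBuildInternalDeleteVal(inodes []uint64) []byte {
	buf := bytes.NewBuffer(nil)
	for _, ino := range inodes {
		// each entry is exactly 8 bytes, mirroring the binary.Read in internalDelete
		_ = binary.Write(buf, binary.BigEndian, ino)
	}
	return buf.Bytes()
}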
func (mp *metaPartition) internalDeleteBatch(val []byte) error {
if len(val) == 0 {
return nil
}
inodes, err := InodeBatchUnmarshal(val)
if err != nil {
return nil
}
for _, ino := range inodes {
log.LogDebugf("internalDelete: received internal delete: partitionID(%v) inode[%v]",
mp.config.PartitionId, ino.Inode)
mp.internalDeleteInode(ino)
}
return nil
}
func (mp *metaPartition) internalDeleteInode(ino *Inode) {
log.LogDebugf("action[internalDeleteInode] ino[%v] really be deleted", ino)
mp.inodeTree.Delete(ino)
mp.freeList.Remove(ino.Inode)
mp.extendTree.Delete(&Extend{inode: ino.Inode}) // Also delete extend attribute.
return
}
func (mp *metaPartition) fsmAppendExtents(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
ino2 := item.(*Inode)
if ino2.ShouldDelete() {
status = proto.OpNotExistErr
return
}
oldSize := int64(ino2.Size)
eks := ino.Extents.CopyExtents()
if status = mp.uidManager.addUidSpace(ino2.Uid, ino2.Inode, eks); status != proto.OpOk {
return
}
delExtents := ino2.AppendExtents(eks, ino.ModifyTime, mp.volType)
mp.updateUsedInfo(int64(ino2.Size)-oldSize, 0, ino2.Inode)
log.LogInfof("fsmAppendExtents mpId[%v].inode[%v] deleteExtents(%v)", mp.config.PartitionId, ino2.Inode, delExtents)
mp.uidManager.minusUidSpace(ino2.Uid, ino2.Inode, delExtents)
log.LogInfof("fsmAppendExtents mpId[%v].inode[%v] DecSplitExts deleteExtents(%v)", mp.config.PartitionId, ino2.Inode, delExtents)
ino2.DecSplitExts(mp.config.PartitionId, delExtents)
mp.extDelCh <- delExtents
return
}
func (mp *metaPartition) fsmAppendExtentsWithCheck(ino *Inode, isSplit bool) (status uint8) {
var (
delExtents []proto.ExtentKey
discardExtentKey []proto.ExtentKey
)
if mp.verSeq < ino.getVer() {
status = proto.OpArgMismatchErr
log.LogErrorf("fsmAppendExtentsWithCheck.mp[%v] param ino[%v] mp seq [%v]", mp.config.PartitionId, ino, mp.verSeq)
return
}
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
fsmIno := item.(*Inode)
if fsmIno.ShouldDelete() {
status = proto.OpNotExistErr
return
}
oldSize := int64(fsmIno.Size)
eks := ino.Extents.CopyExtents()
if len(eks) < 1 {
return
}
if len(eks) > 1 {
discardExtentKey = eks[1:]
}
if status = mp.uidManager.addUidSpace(fsmIno.Uid, fsmIno.Inode, eks[:1]); status != proto.OpOk {
log.LogErrorf("fsmAppendExtentsWithCheck.mp[%v] addUidSpace status [%v]", mp.config.PartitionId, status)
return
}
log.LogDebugf("action[fsmAppendExtentsWithCheck] mp[%v] ver [%v] ino[%v] isSplit %v ek [%v] hist len %v discardExtentKey %v",
mp.config.PartitionId, mp.verSeq, fsmIno.Inode, isSplit, eks[0], fsmIno.getLayerLen(), discardExtentKey)
appendExtParam := &AppendExtParam{
mpId: mp.config.PartitionId,
mpVer: mp.verSeq,
ek: eks[0],
ct: ino.ModifyTime,
discardExtents: discardExtentKey,
volType: mp.volType,
multiVersionList: mp.multiVersionList,
}
if !isSplit {
delExtents, status = fsmIno.AppendExtentWithCheck(appendExtParam)
if status == proto.OpOk {
log.LogInfof("action[fsmAppendExtentsWithCheck] mp[%v] DecSplitExts delExtents [%v]", mp.config.PartitionId, delExtents)
fsmIno.DecSplitExts(appendExtParam.mpId, delExtents)
mp.extDelCh <- delExtents
}
// on conflict, delete eks[0] to clear the garbage data
if status == proto.OpConflictExtentsErr {
log.LogInfof("action[fsmAppendExtentsWithCheck] mp[%v] OpConflictExtentsErr [%v]", mp.config.PartitionId, eks[:1])
if !storage.IsTinyExtent(eks[0].ExtentId) && eks[0].ExtentOffset >= util.ExtentSize {
eks[0].SetSplit(true)
}
mp.extDelCh <- eks[:1]
}
} else {
// only the ek itself will be moved to the layer below;
// the inode verseq is set to the mp version before submit, in case the mp version
// is updated while the request is in flight, which would lead to inconsistency between raft replicas
delExtents, status = fsmIno.SplitExtentWithCheck(appendExtParam)
log.LogInfof("action[fsmAppendExtentsWithCheck] mp[%v] DecSplitExts delExtents [%v]", mp.config.PartitionId, delExtents)
fsmIno.DecSplitExts(mp.config.PartitionId, delExtents)
mp.extDelCh <- delExtents
mp.uidManager.minusUidSpace(fsmIno.Uid, fsmIno.Inode, delExtents)
}
// on conflict, delete eks[0] to clear the garbage data
if status == proto.OpConflictExtentsErr {
mp.extDelCh <- eks[:1]
mp.uidManager.minusUidSpace(fsmIno.Uid, fsmIno.Inode, eks[:1])
log.LogDebugf("fsmAppendExtentsWithCheck mp[%v] delExtents inode[%v] ek(%v)", mp.config.PartitionId, fsmIno.Inode, delExtents)
}
mp.updateUsedInfo(int64(fsmIno.Size)-oldSize, 0, fsmIno.Inode)
log.LogInfof("fsmAppendExtentWithCheck mp[%v] inode[%v] ek(%v) deleteExtents(%v) discardExtents(%v) status(%v)",
mp.config.PartitionId, fsmIno.Inode, eks[0], delExtents, discardExtentKey, status)
return
}
func (mp *metaPartition) fsmAppendObjExtents(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
inode := item.(*Inode)
if inode.ShouldDelete() {
status = proto.OpNotExistErr
return
}
eks := ino.ObjExtents.CopyExtents()
err := inode.AppendObjExtents(eks, ino.ModifyTime)
// a non-nil err means the object extent keys overlap.
if err != nil {
log.LogErrorf("fsmAppendExtents inode[%v] err(%v)", inode.Inode, err)
status = proto.OpConflictExtentsErr
}
return
}
func (mp *metaPartition) fsmExtentsTruncate(ino *Inode) (resp *InodeResponse) {
var err error
resp = NewInodeResponse()
log.LogDebugf("fsmExtentsTruncate. req ino[%v]", ino)
resp.Status = proto.OpOk
item := mp.inodeTree.Get(ino)
if item == nil {
resp.Status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
resp.Status = proto.OpNotExistErr
return
}
if proto.IsDir(i.Type) {
resp.Status = proto.OpArgMismatchErr
return
}
doOnLastKey := func(lastKey *proto.ExtentKey) {
var eks []proto.ExtentKey
eks = append(eks, *lastKey)
mp.uidManager.minusUidSpace(i.Uid, i.Inode, eks)
}
insertSplitKey := func(ek *proto.ExtentKey) {
i.insertEkRefMap(mp.config.PartitionId, ek)
}
if i.getVer() != mp.verSeq {
i.CreateVer(mp.verSeq)
}
i.Lock()
defer i.Unlock()
if err = i.CreateLowerVersion(i.getVer(), mp.multiVersionList); err != nil {
return
}
oldSize := int64(i.Size)
delExtents := i.ExtentsTruncate(ino.Size, ino.ModifyTime, doOnLastKey, insertSplitKey)
if len(delExtents) == 0 {
return
}
if delExtents, err = i.RestoreExts2NextLayer(mp.config.PartitionId, delExtents, mp.verSeq, 0); err != nil {
panic("RestoreExts2NextLayer should not be error")
}
mp.updateUsedInfo(int64(i.Size)-oldSize, 0, i.Inode)
// now we should delete the extent
log.LogInfof("fsmExtentsTruncate.mp (%v) inode[%v] DecSplitExts exts(%v)", mp.config.PartitionId, i.Inode, delExtents)
i.DecSplitExts(mp.config.PartitionId, delExtents)
mp.extDelCh <- delExtents
mp.uidManager.minusUidSpace(i.Uid, i.Inode, delExtents)
return
}
func (mp *metaPartition) fsmEvictInode(ino *Inode) (resp *InodeResponse) {
resp = NewInodeResponse()
log.LogDebugf("action[fsmEvictInode] inode[%v]", ino)
resp.Status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
resp.Status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
log.LogDebugf("action[fsmEvictInode] inode[%v] already be mark delete", ino)
return
}
if proto.IsDir(i.Type) {
if i.IsEmptyDirAndNoSnapshot() {
i.SetDeleteMark()
}
return
}
if i.IsTempFile() {
log.LogDebugf("action[fsmEvictInode] inode[%v] already linke zero and be set mark delete and be put to freelist", ino)
if i.isEmptyVerList() {
i.SetDeleteMark()
mp.freeList.Push(i.Inode)
}
}
return
}
func (mp *metaPartition) fsmBatchEvictInode(ib InodeBatch) (resp []*InodeResponse) {
for _, ino := range ib {
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = append(resp, &InodeResponse{Status: status})
return
}
resp = append(resp, mp.fsmEvictInode(ino))
}
return
}
func (mp *metaPartition) checkAndInsertFreeList(ino *Inode) {
if proto.IsDir(ino.Type) {
return
}
if ino.ShouldDelete() {
mp.freeList.Push(ino.Inode)
} else if ino.IsTempFile() {
ino.AccessTime = time.Now().Unix()
mp.freeList.Push(ino.Inode)
}
}
func (mp *metaPartition) fsmSetAttr(req *SetattrRequest) (err error) {
log.LogDebugf("action[fsmSetAttr] req %v", req)
ino := NewInode(req.Inode, req.Mode)
item := mp.inodeTree.CopyGet(ino)
if item == nil {
return
}
ino = item.(*Inode)
if ino.ShouldDelete() {
return
}
ino.SetAttr(req)
return
}
// fsmExtentsEmpty is only used in the datalake scenario
func (mp *metaPartition) fsmExtentsEmpty(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
status = proto.OpNotExistErr
return
}
if proto.IsDir(i.Type) {
status = proto.OpArgMismatchErr
return
}
log.LogDebugf("action[fsmExtentsEmpty] mp[%v] ino[%v],eks len [%v]", mp.config.PartitionId, ino.Inode, len(i.Extents.eks))
tinyEks := i.CopyTinyExtents()
log.LogDebugf("action[fsmExtentsEmpty] mp[%v] ino[%v],eks tiny len [%v]", mp.config.PartitionId, ino.Inode, len(tinyEks))
if len(tinyEks) > 0 {
mp.extDelCh <- tinyEks
mp.uidManager.minusUidSpace(i.Uid, i.Inode, tinyEks)
log.LogDebugf("fsmExtentsEmpty mp[%v] inode[%d] tinyEks(%v)", mp.config.PartitionId, ino.Inode, tinyEks)
}
i.EmptyExtents(ino.ModifyTime)
return
}
// fsmDelVerExtents, like fsmExtentsEmpty, is only used in the datalake scenario
func (mp *metaPartition) fsmDelVerExtents(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
status = proto.OpNotExistErr
return
}
if proto.IsDir(i.Type) {
status = proto.OpArgMismatchErr
return
}
log.LogDebugf("action[fsmExtentsEmpty] mp[%v] ino[%v],eks len [%v]", mp.config.PartitionId, ino.Inode, len(i.Extents.eks))
tinyEks := i.CopyTinyExtents()
log.LogDebugf("action[fsmExtentsEmpty] mp[%v] ino[%v],eks tiny len [%v]", mp.config.PartitionId, ino.Inode, len(tinyEks))
if len(tinyEks) > 0 {
mp.extDelCh <- tinyEks
log.LogDebugf("fsmExtentsEmpty mp[%v] inode[%d] tinyEks(%v)", mp.config.PartitionId, ino.Inode, tinyEks)
}
i.EmptyExtents(ino.ModifyTime)
return
}
func (mp *metaPartition) fsmClearInodeCache(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.Get(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
ino2 := item.(*Inode)
if ino2.ShouldDelete() {
status = proto.OpNotExistErr
return
}
delExtents := ino2.EmptyExtents(ino.ModifyTime)
log.LogInfof("fsmClearInodeCache.mp[%v] inode[%v] DecSplitExts delExtents(%v)", mp.config.PartitionId, ino2.Inode, delExtents)
if len(delExtents) > 0 {
ino2.DecSplitExts(mp.config.PartitionId, delExtents)
mp.extDelCh <- delExtents
}
return
}
// attention: an unmarshal error will discard the extents
func (mp *metaPartition) fsmSendToChan(val []byte, v3 bool) (status uint8) {
sortExtents := NewSortedExtents()
// ek for del don't need version info
err, _ := sortExtents.UnmarshalBinary(val, v3)
if err != nil {
panic(fmt.Errorf("[fsmDelExtents] unmarshal sortExtents error, mp[%v], err(%s)", mp.config.PartitionId, err.Error()))
}
log.LogInfof("fsmDelExtents mp[%v] delExtents(%v)", mp.config.PartitionId, len(sortExtents.eks))
mp.extDelCh <- sortExtents.eks
return
}
func (mp *metaPartition) fsmSetInodeQuotaBatch(req *proto.BatchSetMetaserverQuotaReuqest) (resp *proto.BatchSetMetaserverQuotaResponse) {
var files int64
var bytes int64
resp = &proto.BatchSetMetaserverQuotaResponse{}
resp.InodeRes = make(map[uint64]uint8, 0)
for _, ino := range req.Inodes {
var isExist bool
var err error
extend := NewExtend(ino)
treeItem := mp.extendTree.Get(extend)
inode := NewInode(ino, 0)
retMsg := mp.getInode(inode, false)
if retMsg.Status != proto.OpOk {
log.LogErrorf("fsmSetInodeQuotaBatch get inode[%v] fail.", ino)
resp.InodeRes[ino] = retMsg.Status
continue
}
inode = retMsg.Msg
log.LogDebugf("fsmSetInodeQuotaBatch msg [%v] inode[%v]", retMsg, inode)
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
quotaInfo := &proto.MetaQuotaInfo{
RootInode: req.IsRoot,
}
if treeItem == nil {
quotaInfos.QuotaInfoMap[req.QuotaId] = quotaInfo
mp.extendTree.ReplaceOrInsert(extend, true)
} else {
extend = treeItem.(*Extend)
value, exist := extend.Get([]byte(proto.QuotaKey))
if exist {
if err = json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("set quota Unmarshal quotaInfos fail [%v]", err)
resp.InodeRes[ino] = proto.OpErr
continue
}
oldQuotaInfo, ok := quotaInfos.QuotaInfoMap[req.QuotaId]
if ok {
isExist = true
quotaInfo = oldQuotaInfo
}
}
quotaInfos.QuotaInfoMap[req.QuotaId] = quotaInfo
}
value, err := json.Marshal(quotaInfos.QuotaInfoMap)
if err != nil {
log.LogErrorf("set quota marsha1 quotaInfos [%v] fail [%v]", quotaInfos, err)
resp.InodeRes[ino] = proto.OpErr
continue
}
extend.Put([]byte(proto.QuotaKey), value, mp.verSeq)
resp.InodeRes[ino] = proto.OpOk
if !isExist {
files += 1
bytes += int64(inode.Size)
}
}
mp.mqMgr.updateUsedInfo(bytes, files, req.QuotaId)
log.LogInfof("fsmSetInodeQuotaBatch quotaId [%v] resp [%v] success.", req.QuotaId, resp)
return
}
func (mp *metaPartition) fsmDeleteInodeQuotaBatch(req *proto.BatchDeleteMetaserverQuotaReuqest) (resp *proto.BatchDeleteMetaserverQuotaResponse) {
var files int64
var bytes int64
resp = &proto.BatchDeleteMetaserverQuotaResponse{}
resp.InodeRes = make(map[uint64]uint8, 0)
for _, ino := range req.Inodes {
var err error
extend := NewExtend(ino)
treeItem := mp.extendTree.Get(extend)
inode := NewInode(ino, 0)
retMsg := mp.getInode(inode, false)
if retMsg.Status != proto.OpOk {
log.LogErrorf("fsmDeleteInodeQuotaBatch get inode[%v] fail.", ino)
resp.InodeRes[ino] = retMsg.Status
continue
}
inode = retMsg.Msg
log.LogDebugf("fsmDeleteInodeQuotaBatch msg [%v] inode[%v]", retMsg, inode)
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
if treeItem == nil {
log.LogDebugf("fsmDeleteInodeQuotaBatch inode[%v] not has extend ", ino)
resp.InodeRes[ino] = proto.OpOk
continue
} else {
extend = treeItem.(*Extend)
value, exist := extend.Get([]byte(proto.QuotaKey))
if exist {
if err = json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("fsmDeleteInodeQuotaBatch ino[%v] Unmarshal quotaInfos fail [%v]", ino, err)
resp.InodeRes[ino] = proto.OpErr
continue
}
_, ok := quotaInfos.QuotaInfoMap[req.QuotaId]
if ok {
delete(quotaInfos.QuotaInfoMap, req.QuotaId)
if len(quotaInfos.QuotaInfoMap) == 0 {
extend.Remove([]byte(proto.QuotaKey))
} else {
value, err = json.Marshal(quotaInfos.QuotaInfoMap)
if err != nil {
log.LogErrorf("fsmDeleteInodeQuotaBatch marsha1 quotaInfos [%v] fail [%v]", quotaInfos, err)
resp.InodeRes[ino] = proto.OpErr
continue
}
extend.Put([]byte(proto.QuotaKey), value, mp.verSeq)
}
} else {
log.LogDebugf("fsmDeleteInodeQuotaBatch QuotaInfoMap can not find inode[%v] quota [%v]", ino, req.QuotaId)
resp.InodeRes[ino] = proto.OpOk
continue
}
} else {
resp.InodeRes[ino] = proto.OpOk
continue
}
}
files -= 1
bytes -= int64(inode.Size)
}
mp.mqMgr.updateUsedInfo(bytes, files, req.QuotaId)
log.LogInfof("fsmDeleteInodeQuotaBatch quotaId [%v] resp [%v] success.", req.QuotaId, resp)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import "github.com/cubefs/cubefs/proto"
func (mp *metaPartition) fsmCreateMultipart(multipart *Multipart) (status uint8) {
_, ok := mp.multipartTree.ReplaceOrInsert(multipart, false)
if !ok {
return proto.OpExistErr
}
return proto.OpOk
}
func (mp *metaPartition) fsmRemoveMultipart(multipart *Multipart) (status uint8) {
deletedItem := mp.multipartTree.Delete(multipart)
if deletedItem == nil {
return proto.OpNotExistErr
}
return proto.OpOk
}
func (mp *metaPartition) fsmAppendMultipart(multipart *Multipart) (resp proto.AppendMultipartResponse) {
storedItem := mp.multipartTree.CopyGet(multipart)
if storedItem == nil {
resp.Status = proto.OpNotExistErr
return
}
storedMultipart, is := storedItem.(*Multipart)
if !is {
resp.Status = proto.OpNotExistErr
return
}
for _, part := range multipart.Parts() {
oldInode, updated, conflict := storedMultipart.UpdateOrStorePart(part)
if conflict {
resp.Status = proto.OpUploadPartConflictErr
return
}
if updated {
resp.OldInode = oldInode
resp.Update = true
}
}
resp.Status = proto.OpOk
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) fsmTxRollback(txID string) (status uint8) {
status = mp.txProcessor.txManager.rollbackTxInfo(txID)
return
}
func (mp *metaPartition) fsmTxDelete(txID string) (status uint8) {
status = mp.txProcessor.txManager.deleteTxInfo(txID)
return
}
func (mp *metaPartition) fsmTxInodeRollback(req *proto.TxInodeApplyRequest) (status uint8) {
status, _ = mp.txProcessor.txResource.rollbackInode(req)
return
}
func (mp *metaPartition) fsmTxDentryRollback(req *proto.TxDentryApplyRequest) (status uint8) {
status, _ = mp.txProcessor.txResource.rollbackDentry(req)
return
}
func (mp *metaPartition) fsmTxSetState(req *proto.TxSetStateRequest) (status uint8) {
status, _ = mp.txProcessor.txManager.txSetState(req)
return
}
func (mp *metaPartition) fsmTxInit(txInfo *proto.TransactionInfo) (status uint8) {
status = proto.OpOk
err := mp.txProcessor.txManager.registerTransaction(txInfo)
if err != nil {
log.LogErrorf("fsmTxInit: register transaction failed, txInfo %s, err %s", txInfo.String(), err.Error())
return proto.OpTxInternalErr
}
return
}
func (mp *metaPartition) fsmTxCommit(txID string) (status uint8) {
status, _ = mp.txProcessor.txManager.commitTxInfo(txID)
return
}
func (mp *metaPartition) fsmTxInodeCommit(txID string, inode uint64) (status uint8) {
// var err error
status, _ = mp.txProcessor.txResource.commitInode(txID, inode)
return
}
func (mp *metaPartition) fsmTxDentryCommit(txID string, pId uint64, name string) (status uint8) {
// var err error
status, _ = mp.txProcessor.txResource.commitDentry(txID, pId, name)
return
}
func (mp *metaPartition) fsmTxCommitRM(txInfo *proto.TransactionInfo) (status uint8) {
status = proto.OpOk
ifo := mp.txProcessor.txManager.copyGetTx(txInfo.TxID)
if ifo == nil || ifo.Finish() {
log.LogWarnf("fsmTxCommitRM: tx already commit or rollback before, tx %v, ifo %v", txInfo, ifo)
return
}
mpId := mp.config.PartitionId
for _, ifo := range txInfo.TxInodeInfos {
if ifo.MpID != mpId {
continue
}
mp.fsmTxInodeCommit(ifo.TxID, ifo.Ino)
}
for _, ifo := range txInfo.TxDentryInfos {
if ifo.MpID != mpId {
continue
}
mp.fsmTxDentryCommit(ifo.TxID, ifo.ParentId, ifo.Name)
}
ifo.SetFinish()
return proto.OpOk
}
func (mp *metaPartition) fsmTxRollbackRM(txInfo *proto.TransactionInfo) (status uint8) {
status = proto.OpOk
ifo := mp.txProcessor.txManager.copyGetTx(txInfo.TxID)
if ifo == nil || ifo.Finish() {
log.LogWarnf("fsmTxRollbackRM: tx already commit or rollback before, tx %v, ifo %v", txInfo, ifo)
return
}
mpId := mp.config.PartitionId
for _, ifo := range txInfo.TxInodeInfos {
if ifo.MpID != mpId {
continue
}
req := &proto.TxInodeApplyRequest{
TxID: ifo.TxID,
Inode: ifo.Ino,
}
mp.fsmTxInodeRollback(req)
}
// delete from rb tree
for _, ifo := range txInfo.TxDentryInfos {
if ifo.MpID != mpId {
continue
}
req := &proto.TxDentryApplyRequest{
TxID: ifo.TxID,
Pid: ifo.ParentId,
Name: ifo.Name,
}
mp.fsmTxDentryRollback(req)
}
ifo.SetFinish()
return proto.OpOk
}
func (mp *metaPartition) inodeInTx(inode uint64) uint8 {
inTx, txId := mp.txProcessor.txResource.isInodeInTransction(NewInode(inode, 0))
if inTx {
log.LogWarnf("inodeInTx: inode is in transaction, inode %d, txId %s", inode, txId)
return proto.OpTxConflictErr
}
return proto.OpOk
}
func (mp *metaPartition) dentryInTx(parIno uint64, name string) uint8 {
inTx, txId := mp.txProcessor.txResource.isDentryInTransction(&Dentry{
ParentId: parIno,
Name: name,
})
if inTx {
log.LogWarnf("inodeInTx: inode is in transaction, parent inode %d, name %s, txId %s", parIno, name, txId)
return proto.OpTxConflictErr
}
return proto.OpOk
}
func (mp *metaPartition) txInodeInRb(inode uint64, newTxId string) (rbInode *TxRollbackInode) {
rbIno := mp.txProcessor.txResource.getTxRbInode(inode)
if rbIno != nil && rbIno.txInodeInfo.TxID == newTxId {
return rbIno
}
return nil
}
func (mp *metaPartition) txDentryInRb(parIno uint64, name, newTxId string) bool {
inTx, txId := mp.txProcessor.txResource.isDentryInTransction(&Dentry{
ParentId: parIno,
Name: name,
})
return inTx && txId == newTxId
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"github.com/cubefs/cubefs/proto"
)
type fsmEvictUniqCheckerRequest struct {
Idx int
UniqID uint64
}
type UniqIdResp struct {
Start uint64
End uint64
Status uint8
}
func (mp *metaPartition) fsmUniqID(val []byte) (resp *UniqIdResp) {
resp = &UniqIdResp{
Status: proto.OpOk,
}
num := binary.BigEndian.Uint32(val)
resp.Start, resp.End = mp.allocateUniqID(num)
return resp
}
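// Illustrative sketch (not part of the original source; the helper name is hypothetical):
// fsmUniqID above decodes `val` as a single 4-byte big-endian count of unique IDs to
// allocate, so a matching payload is simply:
func exampleBuildUniqIDVal(num uint32) []byte {
	val := make([]byte, 4)
	binary.BigEndian.PutUint32(val, num)
	return val
}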
func (mp *metaPartition) fsmUniqCheckerEvict(req *fsmEvictUniqCheckerRequest) error {
mp.uniqChecker.doEvict(req.UniqID)
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"os"
"path"
"reflect"
"strings"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// MetaItem defines the structure of the metadata operations.
type MetaItem struct {
Op uint32 `json:"Op"`
K []byte `json:"k"`
V []byte `json:"v"`
}
// MarshalJson marshals MetaItem to JSON data.
func (s *MetaItem) MarshalJson() ([]byte, error) {
return json.Marshal(s)
}
// MarshalBinary marshals MetaItem to binary data.
// Binary frame structure:
// +------+----+------+------+------+------+
// | Item | Op | LenK | K | LenV | V |
// +------+----+------+------+------+------+
// | byte | 4 | 4 | LenK | 4 | LenV |
// +------+----+------+------+------+------+
func (s *MetaItem) MarshalBinary() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
buff.Grow(4 + len(s.K) + len(s.V))
if err = binary.Write(buff, binary.BigEndian, s.Op); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(s.K))); err != nil {
return
}
if _, err = buff.Write(s.K); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(s.V))); err != nil {
return
}
if _, err = buff.Write(s.V); err != nil {
return
}
result = buff.Bytes()
return
}
// UnmarshalJson unmarshals JSON data to MetaItem.
func (s *MetaItem) UnmarshalJson(data []byte) error {
return json.Unmarshal(data, s)
}
// UnmarshalBinary unmarshals this MetaItem entity from binary data.
// Binary frame structure:
// +------+----+------+------+------+------+
// | Item | Op | LenK | K | LenV | V |
// +------+----+------+------+------+------+
// | byte | 4 | 4 | LenK | 4 | LenV |
// +------+----+------+------+------+------+
func (s *MetaItem) UnmarshalBinary(raw []byte) (err error) {
var (
lenK uint32
lenV uint32
)
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &s.Op); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &lenK); err != nil {
return
}
s.K = make([]byte, lenK)
if _, err = buff.Read(s.K); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &lenV); err != nil {
return
}
s.V = make([]byte, lenV)
if _, err = buff.Read(s.V); err != nil {
return
}
return
}
// NewMetaItem returns a new MetaItem.
func NewMetaItem(op uint32, key, value []byte) *MetaItem {
return &MetaItem{
Op: op,
K: key,
V: value,
}
}
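// Illustrative round-trip of the binary frame documented above (a sketch, not part of
// the original source; the function name and the sample key/value are made up):
// marshal a MetaItem and decode it back to verify the Op/LenK/K/LenV/V layout.
func exampleMetaItemRoundTrip() error {
	src := NewMetaItem(opFSMCreateInode, []byte("sample-key"), []byte("sample-value"))
	raw, err := src.MarshalBinary()
	if err != nil {
		return err
	}
	dst := new(MetaItem)
	if err = dst.UnmarshalBinary(raw); err != nil {
		return err
	}
	if dst.Op != src.Op || !bytes.Equal(dst.K, src.K) || !bytes.Equal(dst.V, src.V) {
		return fmt.Errorf("MetaItem round-trip mismatch: got %v, want %v", dst, src)
	}
	return nil
}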
type fileData struct {
filename string
data []byte
}
const (
// initial version
SnapFormatVersion_0 uint32 = iota
// version since the transaction feature: formatVersion, txId and cursor were added to the MetaItemIterator struct
SnapFormatVersion_1
)
// MetaItemIterator defines the iterator of the MetaItem.
type MetaItemIterator struct {
fileRootDir string
SnapFormatVersion uint32
applyID uint64
uniqID uint64
txId uint64
cursor uint64
inodeTree *BTree
dentryTree *BTree
extendTree *BTree
multipartTree *BTree
txTree *BTree
txRbInodeTree *BTree
txRbDentryTree *BTree
uniqChecker *uniqChecker
verList []*proto.VolVersionInfo
filenames []string
dataCh chan interface{}
errorCh chan error
err error
closeCh chan struct{}
closeOnce sync.Once
}
// SnapItemWrapper key definition
const (
SiwKeySnapFormatVer uint32 = iota
SiwKeyApplyId
SiwKeyTxId
SiwKeyCursor
SiwKeyUniqId
SiwKeyVerList
)
type SnapItemWrapper struct {
key uint32
value interface{}
}
func (siw *SnapItemWrapper) MarshalKey() (k []byte) {
k = make([]byte, 8)
binary.BigEndian.PutUint32(k, siw.key)
return
}
func (siw *SnapItemWrapper) UnmarshalKey(k []byte) (err error) {
siw.key = binary.BigEndian.Uint32(k)
return
}
// newMetaItemIterator returns a new MetaItemIterator.
func newMetaItemIterator(mp *metaPartition) (si *MetaItemIterator, err error) {
si = new(MetaItemIterator)
si.fileRootDir = mp.config.RootDir
si.SnapFormatVersion = mp.manager.metaNode.raftSyncSnapFormatVersion
mp.nonIdempotent.Lock()
si.applyID = mp.getApplyID()
si.txId = mp.txProcessor.txManager.txIdAlloc.getTransactionID()
si.cursor = mp.GetCursor()
si.uniqID = mp.GetUniqId()
si.inodeTree = mp.inodeTree.GetTree()
si.dentryTree = mp.dentryTree.GetTree()
si.extendTree = mp.extendTree.GetTree()
si.multipartTree = mp.multipartTree.GetTree()
si.txTree = mp.txProcessor.txManager.txTree.GetTree()
si.txRbInodeTree = mp.txProcessor.txResource.txRbInodeTree.GetTree()
si.txRbDentryTree = mp.txProcessor.txResource.txRbDentryTree.GetTree()
si.uniqChecker = mp.uniqChecker.clone()
si.verList = mp.GetAllVerList()
mp.nonIdempotent.Unlock()
si.dataCh = make(chan interface{})
si.errorCh = make(chan error, 1)
si.closeCh = make(chan struct{})
// collect extent del files
filenames := make([]string, 0)
var fileInfos []os.DirEntry
if fileInfos, err = os.ReadDir(mp.config.RootDir); err != nil {
return
}
for _, fileInfo := range fileInfos {
if !fileInfo.IsDir() && strings.HasPrefix(fileInfo.Name(), prefixDelExtent) {
filenames = append(filenames, fileInfo.Name())
}
if !fileInfo.IsDir() && strings.HasPrefix(fileInfo.Name(), prefixDelExtentV2) {
filenames = append(filenames, fileInfo.Name())
}
}
si.filenames = filenames
// start data producer
go func(iter *MetaItemIterator) {
defer func() {
close(iter.dataCh)
close(iter.errorCh)
}()
produceItem := func(item interface{}) (success bool) {
select {
case iter.dataCh <- item:
return true
case <-iter.closeCh:
return false
}
}
produceError := func(err error) {
select {
case iter.errorCh <- err:
default:
}
}
checkClose := func() (closed bool) {
select {
case <-iter.closeCh:
return true
default:
return false
}
}
if si.SnapFormatVersion == SnapFormatVersion_0 {
// process index ID
produceItem(si.applyID)
log.LogDebugf("newMetaItemIterator: SnapFormatVersion_0, partitionId(%v), applyID(%v)",
mp.config.PartitionId, si.applyID)
} else if si.SnapFormatVersion == SnapFormatVersion_1 {
// process snapshot format version
snapFormatVerWrapper := SnapItemWrapper{SiwKeySnapFormatVer, si.SnapFormatVersion}
produceItem(snapFormatVerWrapper)
// process apply index ID
applyIdWrapper := SnapItemWrapper{SiwKeyApplyId, si.applyID}
produceItem(applyIdWrapper)
// process txId
txIdWrapper := SnapItemWrapper{SiwKeyTxId, si.txId}
produceItem(txIdWrapper)
// process cursor
cursorWrapper := SnapItemWrapper{SiwKeyCursor, si.cursor}
produceItem(cursorWrapper)
verListWrapper := SnapItemWrapper{SiwKeyVerList, si.verList}
produceItem(verListWrapper)
log.LogDebugf("newMetaItemIterator: SnapFormatVersion_1, partitionId(%v) applyID(%v) txId(%v) cursor(%v) uniqID(%v) verList(%v)",
mp.config.PartitionId, si.applyID, si.txId, si.cursor, si.uniqID, si.verList)
if si.uniqID != 0 {
// process uniqId
uniqIdWrapper := SnapItemWrapper{SiwKeyUniqId, si.uniqID}
produceItem(uniqIdWrapper)
}
} else {
panic(fmt.Sprintf("invalid raftSyncSnapFormatVersione: %v", si.SnapFormatVersion))
}
// process inodes
iter.inodeTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
// process dentries
iter.dentryTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
// process extends
iter.extendTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
// process multiparts
iter.multipartTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
if si.SnapFormatVersion == SnapFormatVersion_1 {
iter.txTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
iter.txRbInodeTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
iter.txRbDentryTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
if si.uniqID != 0 {
produceItem(si.uniqChecker)
if checkClose() {
return
}
}
}
// process extent del files
var err error
var raw []byte
for _, filename := range iter.filenames {
if raw, err = os.ReadFile(path.Join(iter.fileRootDir, filename)); err != nil {
produceError(err)
return
}
if !produceItem(&fileData{filename: filename, data: raw}) {
return
}
}
}(si)
return
}
// ApplyIndex returns the applyID of the iterator.
func (si *MetaItemIterator) ApplyIndex() uint64 {
return si.applyID
}
// Close closes the iterator.
func (si *MetaItemIterator) Close() {
si.closeOnce.Do(func() {
close(si.closeCh)
})
return
}
// Next returns the next item.
func (si *MetaItemIterator) Next() (data []byte, err error) {
if si.err != nil {
err = si.err
return
}
var item interface{}
var open bool
select {
case item, open = <-si.dataCh:
case err, open = <-si.errorCh:
}
if item == nil || !open {
err, si.err = io.EOF, io.EOF
si.Close()
return
}
if err != nil {
si.err = err
si.Close()
return
}
var snap *MetaItem
switch typedItem := item.(type) {
case uint64:
applyIDBuf := make([]byte, 8)
binary.BigEndian.PutUint64(applyIDBuf, si.applyID)
data = applyIDBuf
return
case SnapItemWrapper:
if typedItem.key == SiwKeySnapFormatVer {
snapFormatVerBuf := make([]byte, 8)
binary.BigEndian.PutUint32(snapFormatVerBuf, si.SnapFormatVersion)
snap = NewMetaItem(opFSMSnapFormatVersion, typedItem.MarshalKey(), snapFormatVerBuf)
} else if typedItem.key == SiwKeyApplyId {
applyIDBuf := make([]byte, 8)
binary.BigEndian.PutUint64(applyIDBuf, si.applyID)
snap = NewMetaItem(opFSMApplyId, typedItem.MarshalKey(), applyIDBuf)
} else if typedItem.key == SiwKeyTxId {
txIDBuf := make([]byte, 8)
binary.BigEndian.PutUint64(txIDBuf, si.txId)
snap = NewMetaItem(opFSMTxId, typedItem.MarshalKey(), txIDBuf)
} else if typedItem.key == SiwKeyCursor {
cursor := typedItem.value.(uint64)
cursorBuf := make([]byte, 8)
binary.BigEndian.PutUint64(cursorBuf, cursor)
snap = NewMetaItem(opFSMCursor, typedItem.MarshalKey(), cursorBuf)
} else if typedItem.key == SiwKeyUniqId {
uniqId := typedItem.value.(uint64)
uniqIdBuf := make([]byte, 8)
binary.BigEndian.PutUint64(uniqIdBuf, uniqId)
snap = NewMetaItem(opFSMUniqIDSnap, typedItem.MarshalKey(), uniqIdBuf)
} else if typedItem.key == SiwKeyVerList {
var verListBuf []byte
if verListBuf, err = json.Marshal(typedItem.value.([]*proto.VolVersionInfo)); err != nil {
return
}
snap = NewMetaItem(opFSMVerListSnapShot, typedItem.MarshalKey(), verListBuf)
log.LogInfof("snapshot.fileRootDir %v verList %v", si.fileRootDir, verListBuf)
} else {
panic(fmt.Sprintf("MetaItemIterator.Next: unknown SnapItemWrapper key: %v", typedItem.key))
}
case *Inode:
snap = NewMetaItem(opFSMCreateInode, typedItem.MarshalKey(), typedItem.MarshalValue())
case *Dentry:
snap = NewMetaItem(opFSMCreateDentry, typedItem.MarshalKey(), typedItem.MarshalValue())
case *Extend:
var raw []byte
if raw, err = typedItem.Bytes(); err != nil {
si.err = err
si.Close()
return
}
snap = NewMetaItem(opFSMSetXAttr, nil, raw)
case *Multipart:
var raw []byte
if raw, err = typedItem.Bytes(); err != nil {
si.err = err
si.Close()
return
}
snap = NewMetaItem(opFSMCreateMultipart, nil, raw)
case *proto.TransactionInfo:
val, _ := typedItem.Marshal()
snap = NewMetaItem(opFSMTxSnapshot, []byte(typedItem.TxID), val)
case *TxRollbackInode:
val, _ := typedItem.Marshal()
snap = NewMetaItem(opFSMTxRbInodeSnapshot, typedItem.inode.MarshalKey(), val)
case *TxRollbackDentry:
val, _ := typedItem.Marshal()
snap = NewMetaItem(opFSMTxRbDentrySnapshot, []byte(typedItem.txDentryInfo.GetKey()), val)
case *fileData:
snap = NewMetaItem(opExtentFileSnapshot, []byte(typedItem.filename), typedItem.data)
case *uniqChecker:
var raw []byte
if raw, _, err = typedItem.Marshal(); err != nil {
si.err = err
si.Close()
return
}
snap = NewMetaItem(opFSMUniqCheckerSnap, nil, raw)
default:
panic(fmt.Sprintf("unknown item type: %v", reflect.TypeOf(item).Name()))
}
if data, err = snap.MarshalBinary(); err != nil {
si.err = err
si.Close()
return
}
return
}
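// Illustrative consumer loop (a sketch, not part of the original source, with a
// hypothetical function name): a snapshot sender would drain the iterator by calling
// Next until io.EOF and then release it with Close.
func exampleDrainMetaItemIterator(si *MetaItemIterator) (count int, err error) {
	defer si.Close()
	for {
		var data []byte
		if data, err = si.Next(); err != nil {
			if err == io.EOF {
				err = nil
			}
			return
		}
		_ = data // each entry is a MarshalBinary()-encoded MetaItem frame
		count++
	}
}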
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"fmt"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) TxCreateDentry(req *proto.TxCreateDentryRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
for _, quotaId := range req.QuotaIds {
status := mp.mqMgr.IsOverQuota(false, true, quotaId)
if status != 0 {
err = errors.New("create dentry is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
var parIno *Inode
item := mp.inodeTree.Get(NewInode(req.ParentID, 0))
if item == nil {
err = fmt.Errorf("parent inode not exists")
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
}
parIno = item.(*Inode)
quota := atomic.LoadUint32(&dirChildrenNumLimit)
if parIno.NLink >= quota {
err = fmt.Errorf("parent dir quota limitation reached")
p.PacketErrorWithBody(proto.OpDirQuota, []byte(err.Error()))
return
}
txInfo := req.TxInfo.GetCopy()
txDentry := NewTxDentry(req.ParentID, req.Name, req.Inode, req.Mode, parIno, txInfo)
val, err := txDentry.Marshal()
if err != nil {
return
}
status, err := mp.submit(opFSMTxCreateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.ResultCode = status.(uint8)
return
}
// CreateDentry creates a new dentry.
func (mp *metaPartition) CreateDentry(req *CreateDentryReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, req.ParentID)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
item := mp.inodeTree.CopyGet(NewInode(req.ParentID, 0))
if item == nil {
err = fmt.Errorf("parent inode not exists")
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
} else {
parIno := item.(*Inode)
quota := atomic.LoadUint32(&dirChildrenNumLimit)
if parIno.NLink >= quota {
err = fmt.Errorf("parent dir quota limitation reached")
p.PacketErrorWithBody(proto.OpDirQuota, []byte(err.Error()))
return
}
}
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
Inode: req.Inode,
Type: req.Mode,
multiSnap: NewDentrySnap(mp.GetVerSeq()),
}
val, err := dentry.Marshal()
if err != nil {
return
}
resp, err := mp.submit(opFSMCreateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.ResultCode = resp.(uint8)
return
}
func (mp *metaPartition) QuotaCreateDentry(req *proto.QuotaCreateDentryRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, req.ParentID)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
for _, quotaId := range req.QuotaIds {
status := mp.mqMgr.IsOverQuota(false, true, quotaId)
if status != 0 {
err = errors.New("create dentry is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
item := mp.inodeTree.CopyGet(NewInode(req.ParentID, 0))
if item == nil {
err = fmt.Errorf("parent inode not exists")
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
} else {
parIno := item.(*Inode)
quota := atomic.LoadUint32(&dirChildrenNumLimit)
if parIno.NLink >= quota {
err = fmt.Errorf("parent dir quota limitation reached")
p.PacketErrorWithBody(proto.OpDirQuota, []byte(err.Error()))
return
}
}
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
Inode: req.Inode,
Type: req.Mode,
}
dentry.setVerSeq(mp.verSeq)
log.LogDebugf("action[CreateDentry] mp[%v] with seq [%v],dentry [%v]", mp.config.PartitionId, mp.verSeq, dentry)
val, err := dentry.Marshal()
if err != nil {
return
}
resp, err := mp.submit(opFSMCreateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.ResultCode = resp.(uint8)
return
}
func (mp *metaPartition) TxDeleteDentry(req *proto.TxDeleteDentryRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Ino, req.ParentID)
}()
}
txInfo := req.TxInfo.GetCopy()
den := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
}
defer func() {
if p.ResultCode == proto.OpOk {
var reply []byte
resp := &proto.TxDeleteDentryResponse{
Inode: req.Ino,
}
reply, err = json.Marshal(resp)
p.PacketOkWithBody(reply)
}
}()
dentry, status := mp.getDentry(den)
if status != proto.OpOk {
if mp.txDentryInRb(req.ParentID, req.Name, req.TxInfo.TxID) {
p.ResultCode = proto.OpOk
log.LogWarnf("TxDeleteDentry: dentry is already been deleted before, req %v", req)
return
}
err = fmt.Errorf("dentry[%v] not exists", den)
log.LogWarn(err)
p.PacketErrorWithBody(status, []byte(err.Error()))
return
}
if dentry.Inode != req.Ino {
err = fmt.Errorf("target name ino is not right, par %d, name %s, want %d, got %d",
req.PartitionID, req.Name, req.Ino, dentry.Inode)
log.LogWarn(err)
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
parIno := NewInode(req.ParentID, 0)
inoResp := mp.getInode(parIno, false)
if inoResp.Status != proto.OpOk {
err = fmt.Errorf("parIno[%v] not exists", parIno.Inode)
p.PacketErrorWithBody(inoResp.Status, []byte(err.Error()))
return
}
txDentry := &TxDentry{
// ParInode: inoResp.Msg,
Dentry: dentry,
TxInfo: txInfo,
}
val, err := txDentry.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
r, err := mp.submit(opFSMTxDeleteDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
retMsg := r.(*DentryResponse)
p.ResultCode = retMsg.Status
return
}
// DeleteDentry deletes a dentry.
func (mp *metaPartition) DeleteDentry(req *DeleteDentryReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), 0, req.ParentID)
}()
}
if req.InodeCreateTime > 0 {
if mp.vol.volDeleteLockTime > 0 && req.InodeCreateTime+mp.vol.volDeleteLockTime*60*60 > time.Now().Unix() {
err = errors.NewErrorf("the current Inode[%v] is still locked for deletion", req.Name)
log.LogDebugf("DeleteDentry: the current Inode is still locked for deletion, inode[%v] createTime(%v) mw.volDeleteLockTime(%v) now(%v)", req.Name, req.InodeCreateTime, mp.vol.volDeleteLockTime, time.Now().Unix())
p.PacketErrorWithBody(proto.OpNotPerm, []byte(err.Error()))
return
}
}
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
}
dentry.setVerSeq(req.Verseq)
log.LogDebugf("action[DeleteDentry] den param(%v)", dentry)
val, err := dentry.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
if mp.verSeq == 0 && dentry.getSeqFiled() > 0 {
err = fmt.Errorf("snapshot not enabled")
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
log.LogDebugf("action[DeleteDentry] submit!")
r, err := mp.submit(opFSMDeleteDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
retMsg := r.(*DentryResponse)
p.ResultCode = retMsg.Status
dentry = retMsg.Msg
if p.ResultCode == proto.OpOk {
var reply []byte
resp := &DeleteDentryResp{
Inode: dentry.Inode,
}
reply, err = json.Marshal(resp)
p.PacketOkWithBody(reply)
}
return
}
// DeleteDentryBatch deletes a batch of dentries.
func (mp *metaPartition) DeleteDentryBatch(req *BatchDeleteDentryReq, p *Packet, remoteAddr string) (err error) {
db := make(DentryBatch, 0, len(req.Dens))
start := time.Now()
for i, d := range req.Dens {
db = append(db, &Dentry{
ParentId: req.ParentID,
Name: d.Name,
Inode: d.Inode,
Type: d.Type,
})
den := &d
fullPath := ""
if len(req.FullPaths) > i {
fullPath = req.FullPaths[i]
}
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), den.Name, fullPath, err, time.Since(start).Milliseconds(), den.Inode, req.ParentID)
}()
}
}
val, err := db.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
r, err := mp.submit(opFSMDeleteDentryBatch, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
retMsg := r.([]*DentryResponse)
p.ResultCode = proto.OpOk
bddr := &BatchDeleteDentryResp{}
for _, m := range retMsg {
if m.Status != proto.OpOk {
p.ResultCode = proto.OpErr
}
if dentry := m.Msg; dentry != nil {
bddr.Items = append(bddr.Items, &struct {
Inode uint64 `json:"ino"`
Status uint8 `json:"status"`
}{
Inode: dentry.Inode,
Status: m.Status,
})
} else {
bddr.Items = append(bddr.Items, &struct {
Inode uint64 `json:"ino"`
Status uint8 `json:"status"`
}{
Status: m.Status,
})
}
}
reply, err := json.Marshal(bddr)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) TxUpdateDentry(req *proto.TxUpdateDentryRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, req.ParentID)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
txInfo := req.TxInfo.GetCopy()
defer func() {
if p.ResultCode == proto.OpOk {
var reply []byte
m := &proto.TxUpdateDentryResponse{
Inode: req.OldIno,
}
reply, _ = json.Marshal(m)
p.PacketOkWithBody(reply)
}
}()
newDentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
Inode: req.Inode,
}
oldDentry, status := mp.getDentry(newDentry)
if status != proto.OpOk {
if mp.txDentryInRb(req.ParentID, req.Name, req.TxInfo.TxID) {
p.ResultCode = proto.OpOk
log.LogWarnf("TxDeleteDentry: dentry is already been deleted before, req %v", req)
return
}
err = fmt.Errorf("oldDentry[%v] not exists", oldDentry)
p.PacketErrorWithBody(status, []byte(err.Error()))
return
}
if oldDentry.Inode != req.OldIno {
err = fmt.Errorf("oldDentry is alredy updated, req %v, old [%v]", req, oldDentry)
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
}
txDentry := &TxUpdateDentry{
OldDentry: oldDentry,
NewDentry: newDentry,
TxInfo: txInfo,
}
val, err := txDentry.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMTxUpdateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := resp.(*DentryResponse)
p.ResultCode = msg.Status
return
}
// UpdateDentry updates a dentry.
func (mp *metaPartition) UpdateDentry(req *UpdateDentryReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, req.ParentID)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
Inode: req.Inode,
}
dentry.setVerSeq(mp.verSeq)
val, err := dentry.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMUpdateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := resp.(*DentryResponse)
p.ResultCode = msg.Status
if msg.Status == proto.OpOk {
var reply []byte
m := &UpdateDentryResp{
Inode: msg.Msg.Inode,
}
reply, err = json.Marshal(m)
p.PacketOkWithBody(reply)
}
return
}
func (mp *metaPartition) ReadDirOnly(req *ReadDirOnlyReq, p *Packet) (err error) {
resp := mp.readDirOnly(req)
reply, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
// ReadDir reads the directory based on the given request.
func (mp *metaPartition) ReadDir(req *ReadDirReq, p *Packet) (err error) {
resp := mp.readDir(req)
reply, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) ReadDirLimit(req *ReadDirLimitReq, p *Packet) (err error) {
log.LogInfof("action[ReadDirLimit] read seq [%v], request[%v]", req.VerSeq, req)
resp := mp.readDirLimit(req)
reply, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
// Lookup looks up the given dentry from the request.
func (mp *metaPartition) Lookup(req *LookupReq, p *Packet) (err error) {
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
}
dentry.setVerSeq(req.VerSeq)
var denList []proto.DetryInfo
if req.VerAll {
denList = mp.getDentryList(dentry)
}
dentry, status := mp.getDentry(dentry)
var reply []byte
if status == proto.OpOk || req.VerAll {
var resp *LookupResp
if status == proto.OpOk {
resp = &LookupResp{
Inode: dentry.Inode,
Mode: dentry.Type,
VerSeq: dentry.getSeqFiled(),
LayAll: denList,
}
} else {
resp = &LookupResp{
Inode: 0,
Mode: 0,
VerSeq: 0,
LayAll: denList,
}
}
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// GetDentryTree returns the dentry tree stored in the meta partition.
func (mp *metaPartition) GetDentryTree() *BTree {
return mp.dentryTree.GetTree()
}
// GetDentryTreeLen returns the dentry tree length.
func (mp *metaPartition) GetDentryTreeLen() int {
if mp.dentryTree == nil {
return 0
}
return mp.dentryTree.Len()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"strconv"
"strings"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) UpdateXAttr(req *proto.UpdateXAttrRequest, p *Packet) (err error) {
newValueList := strings.Split(req.Value, ",")
if len(newValueList) < 3 {
err = errors.New("Wrong number of parameters")
log.LogErrorf("action[UpdateXAttr],Wrong number of parameters")
p.PacketErrorWithBody(proto.OpArgMismatchErr, []byte(err.Error()))
return
}
filesInc, err := strconv.ParseInt(newValueList[0], 10, 64)
if err != nil {
log.LogErrorf("action[UpdateXAttr],The parameter must be an integer: err(%v)", err)
p.PacketErrorWithBody(proto.OpArgMismatchErr, []byte(err.Error()))
return
}
dirsInc, err := strconv.ParseInt(newValueList[1], 10, 64)
if err != nil {
log.LogErrorf("action[UpdateXAttr],The parameter must be an integer: err(%v)", err)
p.PacketErrorWithBody(proto.OpArgMismatchErr, []byte(err.Error()))
return
}
bytesInc, err := strconv.ParseInt(newValueList[2], 10, 64)
if err != nil {
log.LogErrorf("action[UpdateXAttr],The parameter must be an integer: err(%v)", err)
p.PacketErrorWithBody(proto.OpArgMismatchErr, []byte(err.Error()))
return
}
mp.xattrLock.Lock()
defer mp.xattrLock.Unlock()
treeItem := mp.extendTree.Get(NewExtend(req.Inode))
if treeItem != nil {
extend := treeItem.(*Extend)
if value, exist := extend.Get([]byte(req.Key)); exist {
oldValueList := strings.Split(string(value), ",")
oldFiles, _ := strconv.ParseInt(oldValueList[0], 10, 64)
oldDirs, _ := strconv.ParseInt(oldValueList[1], 10, 64)
oldBytes, _ := strconv.ParseInt(oldValueList[2], 10, 64)
newFiles := oldFiles + filesInc
newDirs := oldDirs + dirsInc
newBytes := oldBytes + bytesInc
newValue := strconv.FormatInt(newFiles, 10) + "," +
strconv.FormatInt(newDirs, 10) + "," +
strconv.FormatInt(newBytes, 10)
extend := NewExtend(req.Inode)
extend.Put([]byte(req.Key), []byte(newValue), mp.verSeq)
if _, err = mp.putExtend(opFSMUpdateXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
} else {
extend.Put([]byte(req.Key), []byte(req.Value), mp.verSeq)
if _, err = mp.putExtend(opFSMUpdateXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
} else {
extend := NewExtend(req.Inode)
extend.Put([]byte(req.Key), []byte(req.Value), mp.verSeq)
if _, err = mp.putExtend(opFSMUpdateXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
}
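// exampleBuildUpdateXAttrValue is an illustrative sketch, not part of the
// original code: it shows the "filesInc,dirsInc,bytesInc" value format that
// UpdateXAttr expects, with the three deltas joined by commas and parsed back
// via strconv.ParseInt on the server side.
func exampleBuildUpdateXAttrValue(filesInc, dirsInc, bytesInc int64) string {
return strconv.FormatInt(filesInc, 10) + "," +
strconv.FormatInt(dirsInc, 10) + "," +
strconv.FormatInt(bytesInc, 10)
}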
func (mp *metaPartition) SetXAttr(req *proto.SetXAttrRequest, p *Packet) (err error) {
extend := NewExtend(req.Inode)
extend.Put([]byte(req.Key), []byte(req.Value), mp.verSeq)
if _, err = mp.putExtend(opFSMSetXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
func (mp *metaPartition) BatchSetXAttr(req *proto.BatchSetXAttrRequest, p *Packet) (err error) {
extend := NewExtend(req.Inode)
for key, val := range req.Attrs {
extend.Put([]byte(key), []byte(val), mp.verSeq)
}
if _, err = mp.putExtend(opFSMSetXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
func (mp *metaPartition) GetXAttr(req *proto.GetXAttrRequest, p *Packet) (err error) {
response := &proto.GetXAttrResponse{
VolName: req.VolName,
PartitionId: req.PartitionId,
Inode: req.Inode,
Key: req.Key,
}
treeItem := mp.extendTree.Get(NewExtend(req.Inode))
if treeItem != nil {
if extend := treeItem.(*Extend).GetExtentByVersion(req.VerSeq); extend != nil {
if value, exist := extend.Get([]byte(req.Key)); exist {
response.Value = string(value)
}
}
}
var encoded []byte
encoded, err = json.Marshal(response)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) GetAllXAttr(req *proto.GetAllXAttrRequest, p *Packet) (err error) {
response := &proto.GetAllXAttrResponse{
VolName: req.VolName,
PartitionId: req.PartitionId,
Inode: req.Inode,
Attrs: make(map[string]string),
}
treeItem := mp.extendTree.Get(NewExtend(req.Inode))
if treeItem != nil {
if extend := treeItem.(*Extend).GetExtentByVersion(req.VerSeq); extend != nil {
for key, val := range extend.dataMap {
response.Attrs[key] = string(val)
}
}
}
var encoded []byte
encoded, err = json.Marshal(response)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) BatchGetXAttr(req *proto.BatchGetXAttrRequest, p *Packet) (err error) {
response := &proto.BatchGetXAttrResponse{
VolName: req.VolName,
PartitionId: req.PartitionId,
XAttrs: make([]*proto.XAttrInfo, 0, len(req.Inodes)),
}
for _, inode := range req.Inodes {
treeItem := mp.extendTree.Get(NewExtend(inode))
if treeItem != nil {
info := &proto.XAttrInfo{
Inode: inode,
XAttrs: make(map[string]string),
}
var extend *Extend
if extend = treeItem.(*Extend).GetExtentByVersion(req.VerSeq); extend != nil {
for _, key := range req.Keys {
if val, exist := extend.Get([]byte(key)); exist {
info.XAttrs[key] = string(val)
}
}
}
response.XAttrs = append(response.XAttrs, info)
}
}
var encoded []byte
if encoded, err = json.Marshal(response); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) RemoveXAttr(req *proto.RemoveXAttrRequest, p *Packet) (err error) {
extend := NewExtend(req.Inode)
extend.Put([]byte(req.Key), nil, req.VerSeq)
if _, err = mp.putExtend(opFSMRemoveXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
func (mp *metaPartition) ListXAttr(req *proto.ListXAttrRequest, p *Packet) (err error) {
response := &proto.ListXAttrResponse{
VolName: req.VolName,
PartitionId: req.PartitionId,
Inode: req.Inode,
XAttrs: make([]string, 0),
}
treeItem := mp.extendTree.Get(NewExtend(req.Inode))
if treeItem != nil {
if extend := treeItem.(*Extend).GetExtentByVersion(req.VerSeq); extend != nil {
extend.Range(func(key, value []byte) bool {
response.XAttrs = append(response.XAttrs, string(key))
return true
})
}
}
var encoded []byte
encoded, err = json.Marshal(response)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) putExtend(op uint32, extend *Extend) (resp interface{}, err error) {
var marshaled []byte
if marshaled, err = extend.Bytes(); err != nil {
return
}
resp, err = mp.submit(op, marshaled)
return
}
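// setXAttrExample is an illustrative sketch, not part of the original code:
// it shows how the xattr handlers above package a single key/value pair into
// an Extend tagged with the partition's verSeq before submitting it to raft
// through putExtend.
func setXAttrExample(mp *metaPartition, inode uint64, key, value string) error {
extend := NewExtend(inode)
extend.Put([]byte(key), []byte(value), mp.verSeq)
_, err := mp.putExtend(opFSMSetXAttr, extend)
return err
}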
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"fmt"
"os"
"sort"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) CheckQuota(inodeId uint64, p *Packet) (iParm *Inode, inode *Inode, err error) {
iParm = NewInode(inodeId, 0)
status := mp.isOverQuota(inodeId, true, false)
if status != 0 {
log.LogErrorf("CheckQuota dir quota fail inode[%v] status [%v]", inodeId, status)
err = errors.New("CheckQuota dir quota is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
item := mp.inodeTree.Get(iParm)
if item == nil {
err = fmt.Errorf("inode[%v] not exist", iParm)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
inode = item.(*Inode)
mp.uidManager.acLock.Lock()
if mp.uidManager.getUidAcl(inode.Uid) {
log.LogWarnf("CheckQuota UidSpace.volname [%v] mp[%v] uid %v be set full", mp.uidManager.mpID, mp.uidManager.volName, inode.Uid)
mp.uidManager.acLock.Unlock()
status = proto.OpNoSpaceErr
err = errors.New("CheckQuota UidSpace is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
mp.uidManager.acLock.Unlock()
return
}
// ExtentAppend appends an extent.
func (mp *metaPartition) ExtentAppend(req *proto.AppendExtentKeyRequest, p *Packet) (err error) {
if !proto.IsHot(mp.volType) {
err = fmt.Errorf("only support hot vol")
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
ino := NewInode(req.Inode, 0)
if _, _, err = mp.CheckQuota(req.Inode, p); err != nil {
log.LogErrorf("ExtentAppend fail status [%v]", err)
return
}
ext := req.Extent
ino.Extents.Append(ext)
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMExtentsAdd, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
// ExtentAppendWithCheck appends an extent with a discard-extents check.
// Format: one valid extent key followed by zero or more discard keys.
func (mp *metaPartition) ExtentAppendWithCheck(req *proto.AppendExtentKeyWithCheckRequest, p *Packet) (err error) {
status := mp.isOverQuota(req.Inode, true, false)
if status != 0 {
log.LogErrorf("ExtentAppendWithCheck fail status [%v]", status)
err = errors.New("ExtentAppendWithCheck is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
var (
inoParm *Inode
i *Inode
)
if inoParm, i, err = mp.CheckQuota(req.Inode, p); err != nil {
log.LogErrorf("ExtentAppendWithCheck CheckQuota fail err [%v]", err)
return
}
// Check the volume type: for a cold volume, a cbfs extent can be modified/added only when the corresponding objextent exists.
if proto.IsCold(mp.volType) {
i.RLock()
exist, idx := i.ObjExtents.FindOffsetExist(req.Extent.FileOffset)
if !exist {
i.RUnlock()
err = fmt.Errorf("ebs's objextent not exist with offset[%v]", req.Extent.FileOffset)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
if i.ObjExtents.eks[idx].Size != uint64(req.Extent.Size) {
err = fmt.Errorf("ebs's objextent size[%v] isn't equal to the append size[%v]", i.ObjExtents.eks[idx].Size, req.Extent.Size)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
i.RUnlock()
return
}
i.RUnlock()
}
ext := req.Extent
// The extent key's verSeq is not set here because Marshal does not include it;
// the inode's verSeq is used instead.
inoParm.setVer(mp.verSeq)
inoParm.Extents.Append(ext)
log.LogDebugf("ExtentAppendWithCheck: ino(%v) mp[%v] verSeq (%v)", req.Inode, req.PartitionID, mp.verSeq)
// Store discard extents right after the append extent key.
if len(req.DiscardExtents) != 0 {
inoParm.Extents.eks = append(inoParm.Extents.eks, req.DiscardExtents...)
}
val, err := inoParm.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
var opFlag uint32 = opFSMExtentsAddWithCheck
if req.IsSplit {
opFlag = opFSMExtentSplit
}
resp, err := mp.submit(opFlag, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
log.LogDebugf("ExtentAppendWithCheck: ino(%v) mp[%v] verSeq (%v) req.VerSeq(%v) rspcode(%v)", req.Inode, req.PartitionID, mp.verSeq, req.VerSeq, resp.(uint8))
if mp.verSeq > req.VerSeq {
// Reuse ExtentType to flag a version inconsistency between metanode and client;
// the response makes the client refresh every streamer's extents and their verSeq.
p.ExtentType |= proto.MultiVersionFlag
p.VerSeq = mp.verSeq
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
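// exampleAppendWithCheckReq is an illustrative sketch, not part of the
// original code: it shows the request layout ExtentAppendWithCheck expects,
// one valid extent key plus zero or more discard keys stored right after it.
// Other request fields (volume name, partition id, VerSeq, IsSplit) are
// omitted here.
func exampleAppendWithCheckReq(ino uint64, valid proto.ExtentKey, discard []proto.ExtentKey) *proto.AppendExtentKeyWithCheckRequest {
return &proto.AppendExtentKeyWithCheckRequest{
Inode: ino,
Extent: valid,
DiscardExtents: discard,
}
}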
func (mp *metaPartition) SetTxInfo(info []*proto.TxInfo) {
for _, txInfo := range info {
if txInfo.Volume != mp.config.VolName {
continue
}
mp.txProcessor.mask = txInfo.Mask
mp.txProcessor.txManager.setLimit(txInfo.OpLimitVal)
log.LogInfof("SetTxInfo mp[%v] mask %v limit %v", mp.config.PartitionId, proto.GetMaskString(txInfo.Mask), txInfo.OpLimitVal)
}
}
type VerOpData struct {
Op uint8
VerSeq uint64
VerList []*proto.VolVersionInfo
}
func (mp *metaPartition) checkByMasterVerlist(mpVerList *proto.VolVersionInfoList, masterVerList *proto.VolVersionInfoList) (err error) {
currMasterSeq := masterVerList.GetLastVer()
verMapMaster := make(map[uint64]*proto.VolVersionInfo)
for _, ver := range masterVerList.VerList {
verMapMaster[ver.Ver] = ver
}
log.LogDebugf("checkVerList. volname [%v] mp[%v] masterVerList %v mpVerList.VerList %v", mp.config.VolName, mp.config.PartitionId, masterVerList, mpVerList.VerList)
mp.multiVersionList.RWLock.Lock()
defer mp.multiVersionList.RWLock.Unlock()
vlen := len(mpVerList.VerList)
for id, info2 := range mpVerList.VerList {
if id == vlen-1 {
break
}
log.LogDebugf("checkVerList. volname [%v] mp[%v] ver info %v currMasterseq [%v]", mp.config.VolName, mp.config.PartitionId, info2, currMasterSeq)
_, exist := verMapMaster[info2.Ver]
if !exist {
if _, ok := mp.multiVersionList.TemporaryVerMap[info2.Ver]; !ok {
log.LogInfof("checkVerList. volname [%v] mp[%v] ver info %v be consider as TemporaryVer", mp.config.VolName, mp.config.PartitionId, info2)
mp.multiVersionList.TemporaryVerMap[info2.Ver] = info2
}
}
}
for verSeq := range mp.multiVersionList.TemporaryVerMap {
for index, verInfo := range mp.multiVersionList.VerList {
if verInfo.Ver == verSeq {
log.LogInfof("checkVerList.updateVerList volname [%v] mp[%v] ver info %v be consider as TemporaryVer and do deletion verlist %v",
mp.config.VolName, mp.config.PartitionId, verInfo, mp.multiVersionList.VerList)
if index == len(mp.multiVersionList.VerList)-1 {
log.LogInfof("checkVerList.updateVerList volname [%v] mp[%v] last ver info %v should not be consider as TemporaryVer and do deletion verlist %v",
mp.config.VolName, mp.config.PartitionId, verInfo, mp.multiVersionList.VerList)
return
} else {
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList[:index], mp.multiVersionList.VerList[index+1:]...)
}
log.LogInfof("checkVerList.updateVerList volname [%v] mp[%v] verlist %v", mp.config.VolName, mp.config.PartitionId, mp.multiVersionList.VerList)
break
}
}
}
return
}
func (mp *metaPartition) checkVerList(reqVerListInfo *proto.VolVersionInfoList, sync bool) (needUpdate bool, err error) {
mp.multiVersionList.RWLock.RLock()
verMapLocal := make(map[uint64]*proto.VolVersionInfo)
verMapReq := make(map[uint64]*proto.VolVersionInfo)
for _, ver := range reqVerListInfo.VerList {
verMapReq[ver.Ver] = ver
}
var VerList []*proto.VolVersionInfo
for _, info2 := range mp.multiVersionList.VerList {
log.LogDebugf("checkVerList. volname [%v] mp[%v] ver info %v", mp.config.VolName, mp.config.PartitionId, info2)
vms, exist := verMapReq[info2.Ver]
if !exist {
log.LogWarnf("checkVerList. volname [%v] mp[%v] version info(%v) not exist in master (%v)",
mp.config.VolName, mp.config.PartitionId, info2, reqVerListInfo.VerList)
} else if info2.Status != proto.VersionNormal && info2.Status != vms.Status {
log.LogWarnf("checkVerList. volname [%v] mp[%v] ver [%v] status abnormal %v", mp.config.VolName, mp.config.PartitionId, info2.Ver, info2.Status)
info2.Status = vms.Status
needUpdate = true
}
if _, ok := verMapLocal[info2.Ver]; !ok {
verMapLocal[info2.Ver] = info2
VerList = append(VerList, info2)
}
}
mp.multiVersionList.RWLock.RUnlock()
for _, vInfo := range reqVerListInfo.VerList {
if vInfo.Status != proto.VersionNormal && vInfo.Status != proto.VersionPrepare {
log.LogDebugf("checkVerList. volname [%v] mp[%v] master info %v", mp.config.VolName, mp.config.PartitionId, vInfo)
continue
}
ver, exist := verMapLocal[vInfo.Ver]
if !exist {
expStr := fmt.Sprintf("checkVerList.volname [%v] mp[%v] not found %v in mp list and append version %v",
mp.config.VolName, mp.config.PartitionId, vInfo.Ver, vInfo)
log.LogWarnf("[checkVerList] volname [%v]", expStr)
if vInfo.Ver < mp.multiVersionList.GetLastVer() {
continue
}
exporter.Warning(expStr)
VerList = append(VerList, vInfo)
needUpdate = true
verMapLocal[vInfo.Ver] = vInfo
continue
}
if ver.Status != vInfo.Status {
warn := fmt.Sprintf("checkVerList.volname [%v] mp[%v] ver [%v] inoraml.local status [%v] update to %v",
mp.config.VolName, mp.config.PartitionId, vInfo.Status, vInfo.Ver, vInfo.Status)
log.LogWarn(warn)
ver.Status = vInfo.Status
}
}
if needUpdate {
var lastSeq uint64
sort.SliceStable(VerList, func(i, j int) bool {
return VerList[i].Ver < VerList[j].Ver
})
if len(VerList) > 0 {
lastSeq = VerList[len(VerList)-1].Ver
}
if err = mp.HandleVersionOp(proto.SyncBatchVersionList, lastSeq, VerList, sync); err != nil {
return
}
}
return
}
func (mp *metaPartition) HandleVersionOp(op uint8, verSeq uint64, verList []*proto.VolVersionInfo, sync bool) (err error) {
verData := &VerOpData{
Op: op,
VerSeq: verSeq,
VerList: verList,
}
data, _ := json.Marshal(verData)
if sync {
_, err = mp.submit(opFSMVersionOp, data)
return
}
select {
case mp.verUpdateChan <- data:
log.LogDebugf("mp[%v] verseq [%v] op [%v] be pushed to queue", mp.config.PartitionId, verSeq, op)
default:
err = fmt.Errorf("mp[%v] version update channel full, verdata %v not be executed", mp.config.PartitionId, string(data))
}
return
}
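// exampleVerOpPayload is an illustrative sketch, not part of the original
// code: it shows the JSON payload HandleVersionOp builds and submits to raft
// for a synchronous batch version-list update.
func exampleVerOpPayload(verSeq uint64, verList []*proto.VolVersionInfo) []byte {
data, _ := json.Marshal(&VerOpData{
Op: proto.SyncBatchVersionList,
VerSeq: verSeq,
VerList: verList,
})
return data
}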
func (mp *metaPartition) GetAllVersionInfo(req *proto.MultiVersionOpRequest, p *Packet) (err error) {
return
}
func (mp *metaPartition) GetSpecVersionInfo(req *proto.MultiVersionOpRequest, p *Packet) (err error) {
return
}
func (mp *metaPartition) GetExtentByVer(ino *Inode, req *proto.GetExtentsRequest, rsp *proto.GetExtentsResponse) {
log.LogInfof("action[GetExtentByVer] read ino[%v] readseq [%v] ino seq [%v] hist len %v", ino.Inode, req.VerSeq, ino.getVer(), ino.getLayerLen())
reqVer := req.VerSeq
if isInitSnapVer(req.VerSeq) {
reqVer = 0
}
ino.DoReadFunc(func() {
ino.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
if ek.GetSeq() <= reqVer {
rsp.Extents = append(rsp.Extents, ek)
log.LogInfof("action[GetExtentByVer] fresh layer.read ino[%v] readseq [%v] ino seq [%v] include ek [%v]", ino.Inode, reqVer, ino.getVer(), ek)
} else {
log.LogInfof("action[GetExtentByVer] fresh layer.read ino[%v] readseq [%v] ino seq [%v] exclude ek [%v]", ino.Inode, reqVer, ino.getVer(), ek)
}
return true
})
ino.RangeMultiVer(func(idx int, snapIno *Inode) bool {
log.LogInfof("action[GetExtentByVer] read ino[%v] readseq [%v] snapIno ino seq [%v]", ino.Inode, reqVer, snapIno.getVer())
for _, ek := range snapIno.Extents.eks {
if reqVer >= ek.GetSeq() {
log.LogInfof("action[GetExtentByVer] get extent ino[%v] readseq [%v] snapIno ino seq [%v], include ek (%v)", ino.Inode, reqVer, snapIno.getVer(), ek.String())
rsp.Extents = append(rsp.Extents, ek)
} else {
log.LogInfof("action[GetExtentByVer] not get extent ino[%v] readseq [%v] snapIno ino seq [%v], exclude ek (%v)", ino.Inode, reqVer, snapIno.getVer(), ek.String())
}
}
if reqVer >= snapIno.getVer() {
log.LogInfof("action[GetExtentByVer] finish read ino[%v] readseq [%v] snapIno ino seq [%v]", ino.Inode, reqVer, snapIno.getVer())
return false
}
return true
})
sort.SliceStable(rsp.Extents, func(i, j int) bool {
return rsp.Extents[i].FileOffset < rsp.Extents[j].FileOffset
})
})
return
}
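// extentVisibleAtVer is an illustrative sketch, not part of the original
// code: it restates the visibility rule GetExtentByVer applies, an extent
// key recorded at sequence ek.GetSeq() is returned for a snapshot read at
// reqVer only when ek.GetSeq() <= reqVer.
func extentVisibleAtVer(ek proto.ExtentKey, reqVer uint64) bool {
return ek.GetSeq() <= reqVer
}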
func (mp *metaPartition) SetUidLimit(info []*proto.UidSpaceInfo) {
mp.uidManager.volName = mp.config.VolName
mp.uidManager.setUidAcl(info)
}
func (mp *metaPartition) GetUidInfo() (info []*proto.UidReportSpaceInfo) {
return mp.uidManager.getAllUidSpace()
}
// ExtentsList returns the list of extents.
func (mp *metaPartition) ExtentsList(req *proto.GetExtentsRequest, p *Packet) (err error) {
log.LogDebugf("action[ExtentsList] inode[%v] verseq [%v]", req.Inode, req.VerSeq)
// Note: there is no need to set reqSeq here; extent filtering is done in the next step.
ino := NewInode(req.Inode, 0)
retMsg := mp.getInodeTopLayer(ino)
// Note: getInode should not set verSeq because extents must be filtered from the newest layer down to req.VerSeq.
ino = retMsg.Msg
var (
reply []byte
status = retMsg.Status
)
if status == proto.OpOk {
resp := &proto.GetExtentsResponse{}
log.LogInfof("action[ExtentsList] inode[%v] request verseq [%v] ino ver [%v] extent size %v ino.Size %v ino[%v] hist len %v",
req.Inode, req.VerSeq, ino.getVer(), len(ino.Extents.eks), ino.Size, ino, ino.getLayerLen())
if req.VerSeq > 0 && ino.getVer() > 0 && (req.VerSeq < ino.getVer() || isInitSnapVer(req.VerSeq)) {
mp.GetExtentByVer(ino, req, resp)
vIno := ino.Copy().(*Inode)
vIno.setVerNoCheck(req.VerSeq)
if vIno = mp.getInodeByVer(vIno); vIno != nil {
resp.Generation = vIno.Generation
resp.Size = vIno.Size
}
} else {
ino.DoReadFunc(func() {
resp.Generation = ino.Generation
resp.Size = ino.Size
ino.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
resp.Extents = append(resp.Extents, ek)
log.LogInfof("action[ExtentsList] append ek [%v]", ek)
return true
})
})
}
if req.VerAll {
resp.LayerInfo = retMsg.Msg.getAllLayerEks()
}
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// ObjExtentsList returns the list of obj extents and extents.
func (mp *metaPartition) ObjExtentsList(req *proto.GetExtentsRequest, p *Packet) (err error) {
ino := NewInode(req.Inode, 0)
ino.setVer(req.VerSeq)
retMsg := mp.getInode(ino, false)
ino = retMsg.Msg
var (
reply []byte
status = retMsg.Status
)
if status == proto.OpOk {
resp := &proto.GetObjExtentsResponse{}
ino.DoReadFunc(func() {
resp.Generation = ino.Generation
resp.Size = ino.Size
ino.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
resp.Extents = append(resp.Extents, ek)
return true
})
ino.ObjExtents.Range(func(ek proto.ObjExtentKey) bool {
resp.ObjExtents = append(resp.ObjExtents, ek)
return true
})
})
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// ExtentsTruncate truncates an extent.
func (mp *metaPartition) ExtentsTruncate(req *ExtentsTruncateReq, p *Packet, remoteAddr string) (err error) {
if !proto.IsHot(mp.volType) {
err = fmt.Errorf("only support hot vol")
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
fileSize := uint64(0)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, fileSize)
}()
}
ino := NewInode(req.Inode, proto.Mode(os.ModePerm))
item := mp.inodeTree.CopyGet(ino)
if item == nil {
err = fmt.Errorf("inode[%v] is not exist", req.Inode)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
i := item.(*Inode)
status := mp.isOverQuota(req.Inode, req.Size > i.Size, false)
if status != 0 {
log.LogErrorf("ExtentsTruncate fail status [%v]", status)
err = errors.New("ExtentsTruncate is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
ino.Size = req.Size
fileSize = ino.Size
ino.setVer(mp.verSeq)
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMExtentTruncate, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := resp.(*InodeResponse)
p.PacketErrorWithBody(msg.Status, nil)
return
}
func (mp *metaPartition) BatchExtentAppend(req *proto.AppendExtentKeysRequest, p *Packet) (err error) {
if !proto.IsHot(mp.volType) {
err = fmt.Errorf("only support hot vol")
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
var ino *Inode
if ino, _, err = mp.CheckQuota(req.Inode, p); err != nil {
log.LogErrorf("BatchExtentAppend fail err [%v]", err)
return
}
extents := req.Extents
for _, extent := range extents {
ino.Extents.Append(extent)
}
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMExtentsAdd, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
func (mp *metaPartition) BatchObjExtentAppend(req *proto.AppendObjExtentKeysRequest, p *Packet) (err error) {
var ino *Inode
if ino, _, err = mp.CheckQuota(req.Inode, p); err != nil {
log.LogErrorf("BatchObjExtentAppend fail status [%v]", err)
return
}
objExtents := req.Extents
for _, objExtent := range objExtents {
err = ino.ObjExtents.Append(objExtent)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
}
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMObjExtentsAdd, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
// func (mp *metaPartition) ExtentsDelete(req *proto.DelExtentKeyRequest, p *Packet) (err error) {
// ino := NewInode(req.Inode, 0)
// inode := mp.inodeTree.Get(ino).(*Inode)
// inode.Extents.Delete(req.Extents)
// curTime := timeutil.GetCurrentTimeUnix()
// if inode.ModifyTime < curTime {
// inode.ModifyTime = curTime
// }
// val, err := inode.Marshal()
// if err != nil {
// p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
// return
// }
// resp, err := mp.submit(opFSMExtentsDel, val)
// if err != nil {
// p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
// return
// }
// p.PacketErrorWithBody(resp.(uint8), nil)
// return
// }
// ExtentsOp submits an extent operation on the given inode to raft.
// The extents-empty op is only used in the data-lake scenario.
func (mp *metaPartition) ExtentsOp(p *Packet, ino *Inode, op uint32) (err error) {
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(op, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
func (mp *metaPartition) sendExtentsToChan(eks []proto.ExtentKey) (err error) {
if len(eks) == 0 {
return
}
sortExts := NewSortedExtentsFromEks(eks)
val, err := sortExts.MarshalBinary(true)
if err != nil {
return fmt.Errorf("[delExtents] marshal binary fail, %s", err.Error())
}
_, err = mp.submit(opFSMSentToChan, val)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func replyInfoNoCheck(info *proto.InodeInfo, ino *Inode) bool {
ino.RLock()
defer ino.RUnlock()
info.Inode = ino.Inode
info.Mode = ino.Type
info.Size = ino.Size
info.Nlink = ino.NLink
info.Uid = ino.Uid
info.Gid = ino.Gid
info.Generation = ino.Generation
info.VerSeq = ino.getVer()
if length := len(ino.LinkTarget); length > 0 {
info.Target = make([]byte, length)
copy(info.Target, ino.LinkTarget)
}
info.CreateTime = time.Unix(ino.CreateTime, 0)
info.AccessTime = time.Unix(ino.AccessTime, 0)
info.ModifyTime = time.Unix(ino.ModifyTime, 0)
return true
}
func replyInfo(info *proto.InodeInfo, ino *Inode, quotaInfos map[uint32]*proto.MetaQuotaInfo) bool {
ino.RLock()
defer ino.RUnlock()
if ino.Flag&DeleteMarkFlag > 0 {
return false
}
info.Inode = ino.Inode
info.Mode = ino.Type
info.Size = ino.Size
info.Nlink = ino.NLink
info.Uid = ino.Uid
info.Gid = ino.Gid
info.Generation = ino.Generation
info.VerSeq = ino.getVer()
if length := len(ino.LinkTarget); length > 0 {
info.Target = make([]byte, length)
copy(info.Target, ino.LinkTarget)
}
info.CreateTime = time.Unix(ino.CreateTime, 0)
info.AccessTime = time.Unix(ino.AccessTime, 0)
info.ModifyTime = time.Unix(ino.ModifyTime, 0)
info.QuotaInfos = quotaInfos
return true
}
func txReplyInfo(inode *Inode, txInfo *proto.TransactionInfo, quotaInfos map[uint32]*proto.MetaQuotaInfo) (resp *proto.TxCreateInodeResponse) {
inoInfo := &proto.InodeInfo{
Inode: inode.Inode,
Mode: inode.Type,
Nlink: inode.NLink,
Size: inode.Size,
Uid: inode.Uid,
Gid: inode.Gid,
Generation: inode.Generation,
ModifyTime: time.Unix(inode.ModifyTime, 0),
CreateTime: time.Unix(inode.CreateTime, 0),
AccessTime: time.Unix(inode.AccessTime, 0),
QuotaInfos: quotaInfos,
Target: nil,
}
if length := len(inode.LinkTarget); length > 0 {
inoInfo.Target = make([]byte, length)
copy(inoInfo.Target, inode.LinkTarget)
}
resp = &proto.TxCreateInodeResponse{
Info: inoInfo,
TxInfo: txInfo,
}
return
}
// CreateInode returns a new inode.
func (mp *metaPartition) CreateInode(req *CreateInoReq, p *Packet, remoteAddr string) (err error) {
var (
status = proto.OpNotExistErr
reply []byte
resp interface{}
qinode *MetaQuotaInode
inoID uint64
)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), inoID, 0)
}()
}
inoID, err = mp.nextInodeID()
if err != nil {
p.PacketErrorWithBody(proto.OpInodeFullErr, []byte(err.Error()))
return
}
ino := NewInode(inoID, req.Mode)
ino.Uid = req.Uid
ino.Gid = req.Gid
ino.setVer(mp.verSeq)
ino.LinkTarget = req.Target
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return err
}
resp, err = mp.submit(opFSMCreateInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
if resp.(uint8) == proto.OpOk {
resp := &CreateInoResp{
Info: &proto.InodeInfo{},
}
if replyInfo(resp.Info, ino, make(map[uint32]*proto.MetaQuotaInfo, 0)) {
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
}
p.PacketErrorWithBody(status, reply)
log.LogInfof("CreateInode req [%v] qinode[%v] success.", req, qinode)
return
}
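// exampleCreateInoReq is an illustrative sketch, not part of the original
// code: it fills in only the request fields CreateInode actually reads
// (mode, uid, gid and the optional link target); the field types are assumed
// from how they are used above, and fields such as volume name, partition id
// and full path are omitted.
func exampleCreateInoReq(mode, uid, gid uint32, target []byte) *CreateInoReq {
return &CreateInoReq{
Mode: mode,
Uid: uid,
Gid: gid,
Target: target,
}
}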
func (mp *metaPartition) QuotaCreateInode(req *proto.QuotaCreateInodeRequest, p *Packet, remoteAddr string) (err error) {
var (
status = proto.OpNotExistErr
reply []byte
resp interface{}
qinode *MetaQuotaInode
inoID uint64
)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), inoID, 0)
}()
}
inoID, err = mp.nextInodeID()
if err != nil {
p.PacketErrorWithBody(proto.OpInodeFullErr, []byte(err.Error()))
return
}
ino := NewInode(inoID, req.Mode)
ino.Uid = req.Uid
ino.Gid = req.Gid
ino.LinkTarget = req.Target
for _, quotaId := range req.QuotaIds {
status = mp.mqMgr.IsOverQuota(false, true, quotaId)
if status != 0 {
err = errors.New("create inode is over quota")
reply = []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
qinode = &MetaQuotaInode{
inode: ino,
quotaIds: req.QuotaIds,
}
val, err := qinode.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return err
}
resp, err = mp.submit(opFSMCreateInodeQuota, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
if resp.(uint8) == proto.OpOk {
resp := &CreateInoResp{
Info: &proto.InodeInfo{},
}
quotaInfos := make(map[uint32]*proto.MetaQuotaInfo)
for _, quotaId := range req.QuotaIds {
quotaInfos[quotaId] = &proto.MetaQuotaInfo{
RootInode: false,
}
}
if replyInfo(resp.Info, ino, quotaInfos) {
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
}
p.PacketErrorWithBody(status, reply)
log.LogInfof("QuotaCreateInode req [%v] qinode[%v] success.", req, qinode)
return
}
func (mp *metaPartition) TxUnlinkInode(req *proto.TxUnlinkInodeRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
txInfo := req.TxInfo.GetCopy()
var status uint8
var respIno *Inode
defer func() {
var reply []byte
if status == proto.OpOk {
resp := &proto.TxUnlinkInodeResponse{
Info: &proto.InodeInfo{},
}
if respIno != nil {
replyInfo(resp.Info, respIno, make(map[uint32]*proto.MetaQuotaInfo, 0))
if reply, err = json.Marshal(resp); err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
}
}()
ino := NewInode(req.Inode, 0)
inoResp := mp.getInode(ino, true)
if inoResp.Status != proto.OpOk {
if rbIno := mp.txInodeInRb(req.Inode, req.TxInfo.TxID); rbIno != nil {
respIno = rbIno.inode
status = proto.OpOk
item := mp.inodeTree.Get(NewInode(req.Inode, 0))
if item != nil {
respIno = item.(*Inode)
}
p.ResultCode = status
log.LogWarnf("TxUnlinkInode: inode is already unlink before, req %v, rbino[%v], item %v", req, respIno, item)
return nil
}
err = fmt.Errorf("ino[%v] not exists", ino.Inode)
p.PacketErrorWithBody(inoResp.Status, []byte(err.Error()))
return
}
respIno = inoResp.Msg
createTime := respIno.CreateTime
deleteLockTime := mp.vol.volDeleteLockTime * 60 * 60
if deleteLockTime > 0 && createTime+deleteLockTime > time.Now().Unix() {
err = fmt.Errorf("the current Inode[%v] is still locked for deletion", req.Inode)
log.LogDebugf("TxUnlinkInode: the current Inode is still locked for deletion, inode[%v] createTime(%v) mw.volDeleteLockTime(%v) now(%v)", respIno.Inode, createTime, deleteLockTime, time.Now())
p.PacketErrorWithBody(proto.OpNotPerm, []byte(err.Error()))
return
}
ti := &TxInode{
Inode: inoResp.Msg,
TxInfo: txInfo,
}
val, err := ti.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
r, err := mp.submit(opFSMTxUnlinkInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := r.(*InodeResponse)
status = msg.Status
if msg.Msg != nil {
respIno = msg.Msg
}
p.ResultCode = status
return
}
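// inodeDeleteLocked is an illustrative sketch, not part of the original
// code: it restates the delete-lock rule used by TxUnlinkInode, an inode may
// not be unlinked until volDeleteLockTime hours have passed since it was
// created.
func inodeDeleteLocked(createTime, deleteLockHours, now int64) bool {
lockSeconds := deleteLockHours * 60 * 60
return lockSeconds > 0 && createTime+lockSeconds > now
}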
// UnlinkInode unlinks an inode (decreases its link count).
func (mp *metaPartition) UnlinkInode(req *UnlinkInoReq, p *Packet, remoteAddr string) (err error) {
var (
msg *InodeResponse
reply []byte
r interface{}
val []byte
)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
makeRspFunc := func() {
status := msg.Status
if status == proto.OpOk {
resp := &UnlinkInoResp{
Info: &proto.InodeInfo{},
}
replyInfo(resp.Info, msg.Msg, make(map[uint32]*proto.MetaQuotaInfo, 0))
if reply, err = json.Marshal(resp); err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
}
ino := NewInode(req.Inode, 0)
if item := mp.inodeTree.Get(ino); item == nil {
err = fmt.Errorf("mp[%v] inode[%v] reqeust cann't found", mp.config.PartitionId, ino)
log.LogErrorf("action[UnlinkInode] %v", err)
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
}
if req.UniqID > 0 {
val = InodeOnceUnlinkMarshal(req)
r, err = mp.submit(opFSMUnlinkInodeOnce, val)
} else {
ino.setVer(req.VerSeq)
log.LogDebugf("action[UnlinkInode] mp[%v] verseq [%v] ino[%v]", mp.config.PartitionId, req.VerSeq, ino)
val, err = ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
log.LogDebugf("action[UnlinkInode] mp[%v] ino[%v] submit", mp.config.PartitionId, ino)
r, err = mp.submit(opFSMUnlinkInode, val)
}
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg = r.(*InodeResponse)
makeRspFunc()
return
}
// UnlinkInodeBatch unlinks a batch of inodes.
func (mp *metaPartition) UnlinkInodeBatch(req *BatchUnlinkInoReq, p *Packet, remoteAddr string) (err error) {
if len(req.Inodes) == 0 {
return nil
}
var inodes InodeBatch
start := time.Now()
for i, id := range req.Inodes {
inodes = append(inodes, NewInode(id, 0))
ino := id
fullPath := ""
if len(req.FullPaths) > i {
fullPath = req.FullPaths[i]
}
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), fullPath, err, time.Since(start).Milliseconds(), ino, 0)
}()
}
}
val, err := inodes.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
r, err := mp.submit(opFSMUnlinkInodeBatch, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
result := &BatchUnlinkInoResp{}
status := proto.OpOk
for _, ir := range r.([]*InodeResponse) {
if ir.Status != proto.OpOk {
status = ir.Status
}
info := &proto.InodeInfo{}
replyInfo(info, ir.Msg, make(map[uint32]*proto.MetaQuotaInfo, 0))
result.Items = append(result.Items, &struct {
Info *proto.InodeInfo `json:"info"`
Status uint8 `json:"status"`
}{
Info: info,
Status: ir.Status,
})
}
reply, err := json.Marshal(result)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
p.PacketErrorWithBody(status, reply)
return
}
// InodeGetSplitEk returns the split extent key info of an inode.
func (mp *metaPartition) InodeGetSplitEk(req *InodeGetSplitReq, p *Packet) (err error) {
ino := NewInode(req.Inode, 0)
ino.setVer(req.VerSeq)
getAllVerInfo := req.VerAll
retMsg := mp.getInode(ino, getAllVerInfo)
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
ino = retMsg.Msg
var (
reply []byte
status = proto.OpNotExistErr
)
if retMsg.Status == proto.OpOk {
resp := &proto.InodeGetSplitResponse{
Info: &proto.InodeSplitInfo{
Inode: ino.Inode,
VerSeq: ino.getVer(),
},
}
multiSnap := retMsg.Msg.multiSnap
if multiSnap != nil && multiSnap.ekRefMap != nil {
multiSnap.ekRefMap.Range(func(key, value interface{}) bool {
dpID, extID := proto.ParseFromId(key.(uint64))
resp.Info.SplitArr = append(resp.Info.SplitArr, proto.SimpleExtInfo{
ID: key.(uint64),
PartitionID: uint32(dpID),
ExtentID: uint32(extID),
})
return true
})
}
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
status = proto.OpErr
reply = []byte(err.Error())
}
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
}
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
p.PacketErrorWithBody(status, reply)
return
}
// InodeGet executes the inodeGet command from the client.
func (mp *metaPartition) InodeGet(req *InodeGetReq, p *Packet) (err error) {
ino := NewInode(req.Inode, 0)
ino.setVer(req.VerSeq)
getAllVerInfo := req.VerAll
retMsg := mp.getInode(ino, getAllVerInfo)
log.LogDebugf("action[Inode] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
ino = retMsg.Msg
var (
reply []byte
status = proto.OpNotExistErr
quotaInfos map[uint32]*proto.MetaQuotaInfo
)
if mp.mqMgr.EnableQuota() {
quotaInfos, err = mp.getInodeQuotaInfos(req.Inode)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
ino = retMsg.Msg
if retMsg.Status == proto.OpOk {
resp := &proto.InodeGetResponse{
Info: &proto.InodeInfo{},
}
if getAllVerInfo {
replyInfoNoCheck(resp.Info, retMsg.Msg)
} else {
if !replyInfo(resp.Info, retMsg.Msg, quotaInfos) {
p.PacketErrorWithBody(status, reply)
return
}
}
status = proto.OpOk
if getAllVerInfo {
inode := mp.getInodeTopLayer(ino)
log.LogDebugf("req ino[%v], toplayer ino[%v]", retMsg.Msg, inode)
resp.LayAll = inode.Msg.getAllInodesInfo()
}
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// InodeGetBatch executes the inodeBatchGet command from the client.
func (mp *metaPartition) InodeGetBatch(req *InodeGetReqBatch, p *Packet) (err error) {
resp := &proto.BatchInodeGetResponse{}
ino := NewInode(0, 0)
for _, inoId := range req.Inodes {
var quotaInfos map[uint32]*proto.MetaQuotaInfo
ino.Inode = inoId
ino.setVer(req.VerSeq)
retMsg := mp.getInode(ino, false)
if mp.mqMgr.EnableQuota() {
quotaInfos, err = mp.getInodeQuotaInfos(inoId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
}
if retMsg.Status == proto.OpOk {
inoInfo := &proto.InodeInfo{}
if replyInfo(inoInfo, retMsg.Msg, quotaInfos) {
resp.Infos = append(resp.Infos, inoInfo)
}
}
}
data, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(data)
return
}
func (mp *metaPartition) TxCreateInodeLink(req *proto.TxLinkInodeRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
txInfo := req.TxInfo.GetCopy()
ino := NewInode(req.Inode, 0)
inoResp := mp.getInode(ino, true)
if inoResp.Status != proto.OpOk {
err = fmt.Errorf("ino[%v] not exists", ino.Inode)
p.PacketErrorWithBody(inoResp.Status, []byte(err.Error()))
return
}
ti := &TxInode{
Inode: inoResp.Msg,
TxInfo: txInfo,
}
val, err := ti.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMTxCreateLinkInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
retMsg := resp.(*InodeResponse)
status := retMsg.Status
var reply []byte
if retMsg.Status == proto.OpOk {
resp := &proto.TxLinkInodeResponse{
Info: &proto.InodeInfo{},
}
if replyInfo(resp.Info, retMsg.Msg, make(map[uint32]*proto.MetaQuotaInfo, 0)) {
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
}
p.PacketErrorWithBody(status, reply)
return
}
// CreateInodeLink creates a hard link to an inode by increasing its link count.
func (mp *metaPartition) CreateInodeLink(req *LinkInodeReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
var r interface{}
var val []byte
if req.UniqID > 0 {
val = InodeOnceLinkMarshal(req)
r, err = mp.submit(opFSMCreateLinkInodeOnce, val)
} else {
ino := NewInode(req.Inode, 0)
ino.setVer(mp.verSeq)
val, err = ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
r, err = mp.submit(opFSMCreateLinkInode, val)
}
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
retMsg := r.(*InodeResponse)
status := proto.OpNotExistErr
var reply []byte
if retMsg.Status == proto.OpOk {
resp := &LinkInodeResp{
Info: &proto.InodeInfo{},
}
if replyInfo(resp.Info, retMsg.Msg, make(map[uint32]*proto.MetaQuotaInfo, 0)) {
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
}
p.PacketErrorWithBody(status, reply)
return
}
// EvictInode evicts an inode.
func (mp *metaPartition) EvictInode(req *EvictInodeReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
ino := NewInode(req.Inode, 0)
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMEvictInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := resp.(*InodeResponse)
p.PacketErrorWithBody(msg.Status, nil)
return
}
// EvictInodeBatch evicts a batch of inodes.
func (mp *metaPartition) EvictInodeBatch(req *BatchEvictInodeReq, p *Packet, remoteAddr string) (err error) {
if len(req.Inodes) == 0 {
return nil
}
start := time.Now()
var inodes InodeBatch
for i, id := range req.Inodes {
inodes = append(inodes, NewInode(id, 0))
ino := id
fullPath := ""
if len(req.FullPaths) > i {
fullPath = req.FullPaths[i]
}
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), fullPath, err, time.Since(start).Milliseconds(), ino, 0)
}()
}
}
val, err := inodes.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMEvictInodeBatch, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
status := proto.OpOk
for _, m := range resp.([]*InodeResponse) {
if m.Status != proto.OpOk {
status = m.Status
}
}
p.PacketErrorWithBody(status, nil)
return
}
// SetAttr sets the inode attributes.
func (mp *metaPartition) SetAttr(req *SetattrRequest, reqData []byte, p *Packet) (err error) {
if mp.verSeq != 0 {
req.VerSeq = mp.GetVerSeq()
reqData, err = json.Marshal(req)
if err != nil {
log.LogErrorf("setattr: marshal err(%v)", err)
return
}
}
_, err = mp.submit(opFSMSetAttr, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
log.LogDebugf("action[SetAttr] inode[%v] ver [%v] exit", req.Inode, req.VerSeq)
p.PacketOkReply()
return
}
// GetInodeTree returns the inode tree.
func (mp *metaPartition) GetInodeTree() *BTree {
return mp.inodeTree.GetTree()
}
// GetInodeTreeLen returns the inode tree length.
func (mp *metaPartition) GetInodeTreeLen() int {
if mp.inodeTree == nil {
return 0
}
return mp.inodeTree.Len()
}
func (mp *metaPartition) DeleteInode(req *proto.DeleteInodeRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
bytes := make([]byte, 8)
binary.BigEndian.PutUint64(bytes, req.Inode)
_, err = mp.submit(opFSMInternalDeleteInode, bytes)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
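// encodeInodeID is an illustrative sketch, not part of the original code: it
// shows the 8-byte big-endian encoding DeleteInode submits to raft for a
// single inode id.
func encodeInodeID(ino uint64) []byte {
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, ino)
return buf
}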
func (mp *metaPartition) DeleteInodeBatch(req *proto.DeleteInodeBatchRequest, p *Packet, remoteAddr string) (err error) {
if len(req.Inodes) == 0 {
return nil
}
start := time.Now()
var inodes InodeBatch
for i, id := range req.Inodes {
inodes = append(inodes, NewInode(id, 0))
ino := id
fullPath := ""
if len(req.FullPaths) > i {
fullPath = req.FullPaths[i]
}
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), fullPath, err, time.Since(start).Milliseconds(), ino, 0)
}()
}
}
encoded, err := inodes.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
_, err = mp.submit(opFSMInternalDeleteInodeBatch, encoded)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
// ClearInodeCache clears an inode's cbfs extents but keeps its ebs extents.
func (mp *metaPartition) ClearInodeCache(req *proto.ClearInodeCacheRequest, p *Packet) (err error) {
if len(mp.extDelCh) > defaultDelExtentsCnt-100 {
err = fmt.Errorf("extent del chan full")
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
ino := NewInode(req.Inode, 0)
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMClearInodeCache, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
// TxCreateInode returns a new inode.
func (mp *metaPartition) TxCreateInode(req *proto.TxCreateInodeRequest, p *Packet, remoteAddr string) (err error) {
var (
status = proto.OpNotExistErr
reply []byte
resp interface{}
inoID uint64
)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), inoID, 0)
}()
}
inoID, err = mp.nextInodeID()
if err != nil {
p.PacketErrorWithBody(proto.OpInodeFullErr, []byte(err.Error()))
return
}
req.TxInfo.SetCreateInodeId(inoID)
createTxReq := &proto.TxCreateRequest{
VolName: req.VolName,
PartitionID: req.PartitionID,
TransactionInfo: req.TxInfo,
}
err = mp.TxCreate(createTxReq, p)
if err != nil || p.ResultCode != proto.OpOk {
return
}
createResp := &proto.TxCreateResponse{}
err = json.Unmarshal(p.Data, createResp)
if err != nil || createResp.TxInfo == nil {
err = fmt.Errorf("TxCreateInode: unmarshal txInfo failed, data %s, err %v", string(p.Data), err)
log.LogWarn(err)
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
txIno := NewTxInode(inoID, req.Mode, createResp.TxInfo)
txIno.Inode.Uid = req.Uid
txIno.Inode.Gid = req.Gid
txIno.Inode.LinkTarget = req.Target
if log.EnableDebug() {
log.LogDebugf("NewTxInode: TxInode: %v", txIno)
}
if defaultQuotaSwitch {
for _, quotaId := range req.QuotaIds {
status = mp.mqMgr.IsOverQuota(false, true, quotaId)
if status != 0 {
err = errors.New("tx create inode is over quota")
reply = []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
qinode := &TxMetaQuotaInode{
txinode: txIno,
quotaIds: req.QuotaIds,
}
val, err := qinode.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return err
}
resp, err = mp.submit(opFSMTxCreateInodeQuota, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
} else {
val, err := txIno.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return err
}
resp, err = mp.submit(opFSMTxCreateInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
}
if resp == proto.OpOk {
quotaInfos := make(map[uint32]*proto.MetaQuotaInfo)
for _, quotaId := range req.QuotaIds {
quotaInfos[quotaId] = &proto.MetaQuotaInfo{
RootInode: false,
}
}
resp := txReplyInfo(txIno.Inode, createResp.TxInfo, quotaInfos)
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
)
func (mp *metaPartition) GetExpiredMultipart(req *proto.GetExpiredMultipartRequest, p *Packet) (err error) {
expiredMultiPartInfos := make([]*proto.ExpiredMultipartInfo, 0)
walkTreeFunc := func(i BtreeItem) bool {
multipart := i.(*Multipart)
if len(req.Prefix) > 0 && !strings.HasPrefix(multipart.key, req.Prefix) {
// skip and continue
return true
}
if multipart.initTime.Unix()+int64(req.Days*24*60*60) <= time.Now().Local().Unix() {
info := &proto.ExpiredMultipartInfo{
Path: multipart.key,
MultipartId: multipart.id,
Inodes: make([]uint64, 0),
}
for _, part := range multipart.Parts() {
info.Inodes = append(info.Inodes, part.Inode)
}
expiredMultiPartInfos = append(expiredMultiPartInfos, info)
}
return true
}
mp.multipartTree.Ascend(walkTreeFunc)
resp := &proto.GetExpiredMultipartResponse{
Infos: expiredMultiPartInfos,
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
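// multipartExpired is an illustrative sketch, not part of the original code:
// it restates the expiry rule used by GetExpiredMultipart, a multipart upload
// is expired once the requested number of whole days has passed since its
// init time.
func multipartExpired(initTime time.Time, days int64, now time.Time) bool {
return initTime.Unix()+days*24*60*60 <= now.Unix()
}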
func (mp *metaPartition) GetMultipart(req *proto.GetMultipartRequest, p *Packet) (err error) {
item := mp.multipartTree.Get(&Multipart{key: req.Path, id: req.MultipartId})
if item == nil {
p.PacketErrorWithBody(proto.OpNotExistErr, nil)
return
}
multipart := item.(*Multipart)
resp := &proto.GetMultipartResponse{
Info: &proto.MultipartInfo{
ID: multipart.id,
Path: multipart.key,
InitTime: multipart.initTime,
Parts: make([]*proto.MultipartPartInfo, 0, len(multipart.parts)),
Extend: multipart.extend,
},
}
for _, part := range multipart.Parts() {
resp.Info.Parts = append(resp.Info.Parts, &proto.MultipartPartInfo{
ID: part.ID,
Inode: part.Inode,
MD5: part.MD5,
Size: part.Size,
UploadTime: part.UploadTime,
})
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) AppendMultipart(req *proto.AddMultipartPartRequest, p *Packet) (err error) {
if req.Part == nil {
p.PacketOkReply()
return
}
item := mp.multipartTree.Get(&Multipart{key: req.Path, id: req.MultipartId})
if item == nil {
p.PacketErrorWithBody(proto.OpNotExistErr, nil)
return
}
multipart := &Multipart{
id: req.MultipartId,
key: req.Path,
parts: Parts{
&Part{
ID: req.Part.ID,
UploadTime: req.Part.UploadTime,
MD5: req.Part.MD5,
Size: req.Part.Size,
Inode: req.Part.Inode,
},
},
}
var resp interface{}
if resp, err = mp.putMultipart(opFSMAppendMultipart, multipart); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
appendMultipartResp := resp.(proto.AppendMultipartResponse)
if appendMultipartResp.Status != proto.OpOk {
p.PacketErrorWithBody(appendMultipartResp.Status, nil)
return
}
var reply []byte
if reply, err = json.Marshal(appendMultipartResp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) RemoveMultipart(req *proto.RemoveMultipartRequest, p *Packet) (err error) {
multipart := &Multipart{
id: req.MultipartId,
key: req.Path,
}
var resp interface{}
if resp, err = mp.putMultipart(opFSMRemoveMultipart, multipart); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
status := resp.(uint8)
if status != proto.OpOk {
p.PacketErrorWithBody(status, nil)
return
}
p.PacketOkReply()
return
}
func (mp *metaPartition) CreateMultipart(req *proto.CreateMultipartRequest, p *Packet) (err error) {
var multipartId string
for {
multipartId = util.CreateMultipartID(mp.config.PartitionId).String()
storedItem := mp.multipartTree.Get(&Multipart{key: req.Path, id: multipartId})
if storedItem == nil {
break
}
}
multipart := &Multipart{
id: multipartId,
key: req.Path,
initTime: time.Now().Local(),
extend: req.Extend,
}
if _, err = mp.putMultipart(opFSMCreateMultipart, multipart); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp := &proto.CreateMultipartResponse{
Info: &proto.MultipartInfo{
ID: multipartId,
Path: req.Path,
},
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) ListMultipart(req *proto.ListMultipartRequest, p *Packet) (err error) {
max := int(req.Max)
keyMarker := req.Marker
multipartIdMarker := req.MultipartIdMarker
prefix := req.Prefix
matches := make([]*Multipart, 0, max)
walkTreeFunc := func(i BtreeItem) bool {
multipart := i.(*Multipart)
// prefix is enabled
if len(prefix) > 0 && !strings.HasPrefix(multipart.key, prefix) {
// skip and continue
return true
}
matches = append(matches, multipart)
return !(len(matches) >= max)
}
if len(keyMarker) > 0 {
mp.multipartTree.AscendGreaterOrEqual(&Multipart{key: keyMarker, id: multipartIdMarker}, walkTreeFunc)
} else {
mp.multipartTree.Ascend(walkTreeFunc)
}
multipartInfos := make([]*proto.MultipartInfo, len(matches))
convertPartFunc := func(part *Part) *proto.MultipartPartInfo {
return &proto.MultipartPartInfo{
ID: part.ID,
Inode: part.Inode,
MD5: part.MD5,
Size: part.Size,
UploadTime: part.UploadTime,
}
}
convertMultipartFunc := func(multipart *Multipart) *proto.MultipartInfo {
partInfos := make([]*proto.MultipartPartInfo, len(multipart.parts))
for i := 0; i < len(multipart.parts); i++ {
partInfos[i] = convertPartFunc(multipart.parts[i])
}
return &proto.MultipartInfo{
ID: multipart.id,
Path: multipart.key,
InitTime: multipart.initTime,
Parts: partInfos,
}
}
for i := 0; i < len(matches); i++ {
multipartInfos[i] = convertMultipartFunc(matches[i])
}
resp := &proto.ListMultipartResponse{
Multiparts: multipartInfos,
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
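// ListMultipart walks the multipart tree from a (key, multipartId) marker and
// stops its callback once max entries have been collected, so callers page
// through large listings by feeding the last returned entry back as the next
// marker. A hedged sketch of that loop; listPage is a hypothetical helper that
// sends the request and decodes the ListMultipartResponse:
//
//    marker, idMarker := "", ""
//    for {
//        req := &proto.ListMultipartRequest{Max: 100, Marker: marker, MultipartIdMarker: idMarker}
//        resp := listPage(req) // hypothetical transport helper
//        if len(resp.Multiparts) < 100 {
//            break // short page: listing is complete
//        }
//        last := resp.Multiparts[len(resp.Multiparts)-1]
//        marker, idMarker = last.Path, last.ID
//    }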
// putMultipart replicates the specified multipart operation to raft.
func (mp *metaPartition) putMultipart(op uint32, multipart *Multipart) (resp interface{}, err error) {
var encoded []byte
if encoded, err = multipart.Bytes(); err != nil {
return
}
resp, err = mp.submit(op, encoded)
return
}
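// Every handler in this file follows the same shape: build a Multipart carrying
// only the fields the FSM needs, replicate it through putMultipart with the
// matching opFSM* code, then interpret the raft reply. A minimal sketch of that
// call pattern, mirroring RemoveMultipart with hypothetical values:
//
//    m := &Multipart{id: "hypothetical-id", key: "/bucket/object"}
//    resp, err := mp.putMultipart(opFSMRemoveMultipart, m)
//    if err == nil && resp.(uint8) != proto.OpOk {
//        // replicated through raft, but the FSM rejected the operation
//    }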
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) batchSetInodeQuota(req *proto.BatchSetMetaserverQuotaReuqest,
resp *proto.BatchSetMetaserverQuotaResponse) (err error) {
if len(req.Inodes) == 0 {
return nil
}
val, err := json.Marshal(req)
if err != nil {
log.LogErrorf("batchSetInodeQuota marshal req [%v] failed [%v]", req, err)
return
}
r, err := mp.submit(opFSMSetInodeQuotaBatch, val)
if err != nil {
log.LogErrorf("batchSetInodeQuota submit req [%v] failed [%v]", req, err)
return
}
resp.InodeRes = r.(*proto.BatchSetMetaserverQuotaResponse).InodeRes
log.LogInfof("batchSetInodeQuota quotaId [%v] mp[%v] btreeLen [%v] resp [%v] success", req.QuotaId, mp.config.PartitionId,
mp.extendTree.Len(), resp)
return
}
func (mp *metaPartition) batchDeleteInodeQuota(req *proto.BatchDeleteMetaserverQuotaReuqest,
resp *proto.BatchDeleteMetaserverQuotaResponse) (err error) {
if len(req.Inodes) == 0 {
return nil
}
val, err := json.Marshal(req)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota marshal req [%v] failed [%v]", req, err)
return
}
r, err := mp.submit(opFSMDeleteInodeQuotaBatch, val)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota submit req [%v] failed [%v]", req, err)
return
}
resp.InodeRes = r.(*proto.BatchDeleteMetaserverQuotaResponse).InodeRes
log.LogInfof("batchSetInodeQuota quotaId [%v] mp[%v] btreeLen [%v] resp [%v] success", req.QuotaId, mp.config.PartitionId,
mp.extendTree.Len(), resp)
return
}
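// Both batch quota entry points share one flow: marshal the whole request, hand
// it to raft via submit, then copy the per-inode results out of the FSM's typed
// response. A hedged sketch of the set path with hypothetical IDs:
//
//    req := &proto.BatchSetMetaserverQuotaReuqest{QuotaId: 1, Inodes: []uint64{100, 101}}
//    resp := &proto.BatchSetMetaserverQuotaResponse{}
//    if err := mp.batchSetInodeQuota(req, resp); err == nil {
//        _ = resp.InodeRes // per-inode result codes filled in by the FSM
//    }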
func (mp *metaPartition) setQuotaHbInfo(infos []*proto.QuotaHeartBeatInfo) {
mp.mqMgr.setQuotaHbInfo(infos)
return
}
func (mp *metaPartition) getQuotaReportInfos() (infos []*proto.QuotaReportInfo) {
return mp.mqMgr.getQuotaReportInfos()
}
func (mp *metaPartition) statisticExtendByLoad(extend *Extend) {
mqMgr := mp.mqMgr
ino := NewInode(extend.GetInode(), 0)
retMsg := mp.getInode(ino, true)
if retMsg.Status != proto.OpOk {
log.LogErrorf("statisticExtendByLoad get inode[%v] fail [%v].", extend.GetInode(), retMsg.Status)
return
}
ino = retMsg.Msg
if ino.NLink == 0 {
return
}
quotaIds, isFind := mp.isExistQuota(extend.GetInode())
if isFind {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
for _, quotaId := range quotaIds {
var baseInfo proto.QuotaUsedInfo
value, isFind := mqMgr.statisticBase.Load(quotaId)
if isFind {
baseInfo = value.(proto.QuotaUsedInfo)
}
baseInfo.UsedBytes += int64(ino.Size)
baseInfo.UsedFiles += 1
mqMgr.statisticBase.Store(quotaId, baseInfo)
log.LogDebugf("[statisticExtendByLoad] quotaId [%v] baseInfo [%v]", quotaId, baseInfo)
}
}
log.LogInfof("statisticExtendByLoad ino[%v] isFind [%v].", ino.Inode, isFind)
return
}
func (mp *metaPartition) statisticExtendByStore(extend *Extend, inodeTree *BTree) {
mqMgr := mp.mqMgr
ino := NewInode(extend.GetInode(), 0)
retMsg := mp.getInode(ino, true)
if retMsg.Status != proto.OpOk {
log.LogErrorf("statisticExtendByStore get inode[%v] fail [%v].", extend.GetInode(), retMsg.Status)
return
}
ino = retMsg.Msg
if ino.NLink == 0 {
return
}
value, exist := extend.Get([]byte(proto.QuotaKey))
if !exist {
log.LogDebugf("statisticExtendByStore get quota key failed, mp[%v] inode[%v]", mp.config.PartitionId, extend.GetInode())
return
}
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
if err := json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("statisticExtendByStore inode[%v] Unmarshal quotaInfos fail [%v]", extend.GetInode(), err)
return
}
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
for quotaId := range quotaInfos.QuotaInfoMap {
var baseInfo proto.QuotaUsedInfo
value, isFind := mqMgr.statisticRebuildBase.Load(quotaId)
if isFind {
baseInfo = value.(proto.QuotaUsedInfo)
}
baseInfo.UsedBytes += int64(ino.Size)
baseInfo.UsedFiles += 1
mqMgr.statisticRebuildBase.Store(quotaId, baseInfo)
log.LogDebugf("[statisticExtendByStore] mp[%v] quotaId [%v] inode[%v] baseInfo [%v]",
mp.config.PartitionId, quotaId, extend.GetInode(), baseInfo)
}
log.LogDebugf("statisticExtendByStore mp[%v] inode[%v] success.", mp.config.PartitionId, extend.GetInode())
return
}
func (mp *metaPartition) updateUsedInfo(size int64, files int64, ino uint64) {
quotaIds, isFind := mp.isExistQuota(ino)
if isFind {
log.LogInfof("updateUsedInfo ino[%v] quotaIds [%v] size [%v] files [%v]", ino, quotaIds, size, files)
for _, quotaId := range quotaIds {
mp.mqMgr.updateUsedInfo(size, files, quotaId)
}
}
return
}
func (mp *metaPartition) isExistQuota(ino uint64) (quotaIds []uint32, isFind bool) {
extend := NewExtend(ino)
treeItem := mp.extendTree.Get(extend)
if treeItem == nil {
isFind = false
return
}
extend = treeItem.(*Extend)
value, exist := extend.Get([]byte(proto.QuotaKey))
if !exist {
isFind = false
return
}
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
if err := json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("set quota inode[%v] Unmarshal quotaInfos fail [%v]", ino, err)
isFind = false
return
}
isFind = true
quotaInfos.RLock()
for quotaId := range quotaInfos.QuotaInfoMap {
quotaIds = append(quotaIds, quotaId)
}
quotaInfos.RUnlock()
log.LogInfof("isExistQuota inode:[%v] quotaIds [%v] isFind[%v]", ino, quotaIds, isFind)
return
}
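// Quota membership lives in the inode's extend entry: the value stored under
// proto.QuotaKey is the JSON encoding of a quotaId -> MetaQuotaInfo map, and
// isExistQuota simply unmarshals that map and returns its keys. A sketch of how
// that value is produced (hypothetical quota IDs 1 and 2, mirroring
// setInodeQuota below):
//
//    quotaInfos := &proto.MetaQuotaInfos{QuotaInfoMap: map[uint32]*proto.MetaQuotaInfo{
//        1: {RootInode: false},
//        2: {RootInode: false},
//    }}
//    value, _ := json.Marshal(quotaInfos.QuotaInfoMap) // bytes kept under proto.QuotaKey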
func (mp *metaPartition) isOverQuota(ino uint64, size bool, files bool) (status uint8) {
quotaIds, isFind := mp.isExistQuota(ino)
if isFind {
for _, quotaId := range quotaIds {
status = mp.mqMgr.IsOverQuota(size, files, quotaId)
if status != 0 {
log.LogWarnf("isOverQuota ino[%v] quotaId [%v] size [%v] files[%v] status[%v]", ino, quotaId, size, files, status)
return
}
}
}
return
}
func (mp *metaPartition) getInodeQuota(inode uint64, p *Packet) (err error) {
extend := NewExtend(inode)
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
var (
value []byte
exist bool
)
treeItem := mp.extendTree.CopyGet(extend)
if treeItem == nil {
goto handleRsp
}
extend = treeItem.(*Extend)
value, exist = extend.Get([]byte(proto.QuotaKey))
if exist {
if err = json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("getInodeQuota inode[%v] Unmarshal quotaInfos fail [%v]", inode, err)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
}
handleRsp:
response := &proto.GetInodeQuotaResponse{}
log.LogInfof("getInodeQuota indoe %v ,map %v", inode, quotaInfos.QuotaInfoMap)
response.MetaQuotaInfoMap = quotaInfos.QuotaInfoMap
encoded, err := json.Marshal(response)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) getInodeQuotaInfos(inode uint64) (quotaInfos map[uint32]*proto.MetaQuotaInfo, err error) {
log.LogInfof("getInodeQuotaInfos mp[%v] treeLen[%v]", mp.config.PartitionId, mp.extendTree.Len())
treeItem := mp.extendTree.Get(NewExtend(inode))
if treeItem == nil {
return
}
extend := treeItem.(*Extend)
info := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
value, exist := extend.Get([]byte(proto.QuotaKey))
if exist {
if err = json.Unmarshal(value, &info.QuotaInfoMap); err != nil {
log.LogErrorf("getInodeQuota inode[%v] Unmarshal quotaInfos fail [%v]", inode, err)
return
}
quotaInfos = info.QuotaInfoMap
}
log.LogInfof("getInodeQuotaInfos inode[%v] quotaInfos [%v] exist [%v]", inode, quotaInfos, exist)
return
}
func (mp *metaPartition) setInodeQuota(quotaIds []uint32, inode uint64) {
extend := NewExtend(inode)
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
for _, quotaId := range quotaIds {
quotaInfo := &proto.MetaQuotaInfo{
RootInode: false,
}
quotaInfos.QuotaInfoMap[quotaId] = quotaInfo
}
value, err := json.Marshal(quotaInfos.QuotaInfoMap)
if err != nil {
log.LogErrorf("setInodeQuota marsha1 quotaInfos [%v] fail [%v]", quotaInfos, err)
return
}
extend.Put([]byte(proto.QuotaKey), value, mp.verSeq)
treeItem := mp.extendTree.CopyGet(extend)
var e *Extend
if treeItem == nil {
mp.extendTree.ReplaceOrInsert(extend, true)
} else {
e = treeItem.(*Extend)
e.Merge(extend, true)
}
log.LogInfof("setInodeQuota inode[%v] quota [%v] success.", inode, quotaIds)
return
}
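// setInodeQuota writes the quota set as a single xattr and merges it into any
// existing extend entry for the inode via Extend.Merge, so a later lookup sees
// the new quota IDs. A hedged usage sketch with hypothetical values:
//
//    var ino uint64 = 100 // hypothetical inode
//    mp.setInodeQuota([]uint32{1, 2}, ino)
//    ids, ok := mp.isExistQuota(ino) // ok == true, ids contains 1 and 2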
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) TxCreate(req *proto.TxCreateRequest, p *Packet) error {
var err error
txInfo := req.TransactionInfo.GetCopy()
// 1. init tx in tm
ifo, err := mp.txInit(txInfo, p)
if err != nil || ifo == nil {
return err
}
if ifo.TmID != int64(mp.config.PartitionId) {
p.PacketOkReply()
return nil
}
if ifo.State != proto.TxStatePreCommit {
log.LogWarnf("TxCreate: tx is already init, txInfo %s", ifo.String())
p.PacketOkReply()
return nil
}
// 2. add tx to other rm
mp.txInitToRm(ifo, p)
if p.ResultCode != proto.OpOk {
return nil
}
resp := &proto.TxCreateResponse{
TxInfo: ifo,
}
status := proto.OpOk
reply, err := json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
p.PacketErrorWithBody(status, reply)
return nil
}
func (mp *metaPartition) txInitToRm(txInfo *proto.TransactionInfo, p *Packet) {
mpIfos := txInfo.GroupByMp()
statusCh := make(chan uint8, len(mpIfos))
wg := sync.WaitGroup{}
for mpId, ifo := range mpIfos {
if mp.config.PartitionId == mpId {
continue
}
req := &proto.TxCreateRequest{
VolName: mp.config.VolName,
PartitionID: mpId,
TransactionInfo: txInfo,
}
pkt, _ := buildTxPacket(req, mpId, proto.OpMetaTxCreate)
members := ifo.Members
wg.Add(1)
go func() {
defer wg.Done()
status := mp.txProcessor.txManager.txSendToMpWithAddrs(members, pkt)
if status != proto.OpOk {
log.LogWarnf("txInitRm: send to rm failed, addr %s, pkt %s, status %s",
members, string(pkt.Data), proto.GetStatusStr(status))
}
statusCh <- status
}()
}
wg.Wait()
close(statusCh)
for status := range statusCh {
if !canRetry(status) {
p.ResultCode = status
return
}
if status != proto.OpOk {
p.ResultCode = status
return
}
}
p.ResultCode = proto.OpOk
return
}
func canRetry(status uint8) bool {
if status == proto.OpOk || status == proto.OpAgain || status == proto.OpErr {
return true
}
return false
}
func (mp *metaPartition) txInit(txInfo *proto.TransactionInfo, p *Packet) (ifo *proto.TransactionInfo, err error) {
if uint64(txInfo.TmID) == mp.config.PartitionId {
err = mp.initTxInfo(txInfo)
if err != nil {
log.LogWarnf("init tx limited, ifo %v", txInfo)
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
}
val, err := txInfo.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return nil, err
}
status, err := mp.submit(opFSMTxInit, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return nil, err
}
if status.(uint8) != proto.OpOk {
p.ResultCode = status.(uint8)
return nil, fmt.Errorf("init tx by raft failed, %v", proto.GetStatusStr(p.ResultCode))
}
ifo = mp.txProcessor.txManager.getTransaction(txInfo.TxID)
if ifo == nil {
log.LogWarnf("TxCreate: tx is still not exist, info %s", txInfo.String())
p.ResultCode = proto.OpTxInfoNotExistErr
return nil, nil
}
return ifo, nil
}
// TxCommitRM used to commit tx for single TM or RM
func (mp *metaPartition) TxCommitRM(req *proto.TxApplyRMRequest, p *Packet) error {
txInfo := req.TransactionInfo.GetCopy()
ifo := mp.txProcessor.txManager.getTransaction(txInfo.TxID)
if ifo == nil {
log.LogWarnf("TxCommitRM: can't find tx, already rollback or commit, ifo %v", req.TransactionInfo)
p.PacketErrorWithBody(proto.OpTxInfoNotExistErr, []byte(fmt.Sprintf("tx %s is not exist", txInfo.TxID)))
return nil
}
if ifo.Finish() {
log.LogWarnf("TxCommitRM: tx already commit before in rm, tx %v", ifo)
p.ResultCode = proto.OpOk
return nil
}
val, err := ifo.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
status, err := mp.submit(opFSMTxCommitRM, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
p.ResultCode = status.(uint8)
return nil
}
// TxRollbackRM used to rollback tx for single TM or RM
func (mp *metaPartition) TxRollbackRM(req *proto.TxApplyRMRequest, p *Packet) error {
txInfo := req.TransactionInfo.GetCopy()
ifo := mp.txProcessor.txManager.getTransaction(txInfo.TxID)
if ifo == nil {
log.LogWarnf("TxRollbackRM: can't find tx, already rollback or commit, ifo %v", req.TransactionInfo)
p.PacketErrorWithBody(proto.OpTxInfoNotExistErr, []byte(fmt.Sprintf("tx %s is not exist", txInfo.TxID)))
return nil
}
if ifo.Finish() {
log.LogWarnf("TxRollbackRM: tx already commit before in rm, tx %v", ifo)
p.ResultCode = proto.OpOk
return nil
}
val, err := txInfo.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
status, err := mp.submit(opFSMTxRollbackRM, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
p.ResultCode = status.(uint8)
return nil
}
func (mp *metaPartition) TxCommit(req *proto.TxApplyRequest, p *Packet, remoteAddr string) error {
var err error
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogTxOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.TxID, err, time.Since(start).Milliseconds())
}()
}
status, err := mp.txProcessor.txManager.commitTx(req.TxID, false)
if err != nil {
p.PacketErrorWithBody(status, []byte(err.Error()))
return err
}
p.ResultCode = status
return err
}
func (mp *metaPartition) TxRollback(req *proto.TxApplyRequest, p *Packet, remoteAddr string) error {
var err error
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogTxOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.TxID, err, time.Since(start).Milliseconds())
}()
}
status, err := mp.txProcessor.txManager.rollbackTx(req.TxID, false)
if err != nil {
p.PacketErrorWithBody(status, []byte(err.Error()))
return err
}
p.ResultCode = status
return err
}
func (mp *metaPartition) TxGetCnt() (uint64, uint64, uint64) {
txCnt := mp.txProcessor.txManager.txTree.Len()
rbInoCnt := mp.txProcessor.txResource.txRbInodeTree.Len()
rbDenCnt := mp.txProcessor.txResource.txRbDentryTree.Len()
return uint64(txCnt), uint64(rbInoCnt), uint64(rbDenCnt)
}
func (mp *metaPartition) TxGetTree() (*BTree, *BTree, *BTree) {
tx := mp.txProcessor.txManager.txTree.GetTree()
rbIno := mp.txProcessor.txResource.txRbInodeTree.GetTree()
rbDen := mp.txProcessor.txResource.txRbDentryTree.GetTree()
return tx, rbIno, rbDen
}
func (mp *metaPartition) TxGetInfo(req *proto.TxGetInfoRequest, p *Packet) (err error) {
var status uint8
txItem := proto.NewTxInfoBItem(req.TxID)
var txInfo *proto.TransactionInfo
if item := mp.txProcessor.txManager.txTree.Get(txItem); item != nil {
txInfo = item.(*proto.TransactionInfo)
status = proto.OpOk
} else {
status = proto.OpTxInfoNotExistErr
}
var reply []byte
if status == proto.OpOk {
resp := &proto.TxGetInfoResponse{
TxInfo: txInfo,
}
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return err
}
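// The handlers above share one reply convention: marshal the response struct to
// JSON and write it back together with the status via PacketErrorWithBody, and
// downgrade the status to OpErr with the error text as the body if marshaling
// fails. A condensed sketch of that pattern (resp stands for any response type
// in this file):
//
//    reply, err := json.Marshal(resp)
//    if err != nil {
//        status = proto.OpErr
//        reply = []byte(err.Error())
//    }
//    p.PacketErrorWithBody(status, reply)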
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"sync/atomic"
"github.com/cubefs/cubefs/proto"
)
func (mp *metaPartition) GetUniqID(p *Packet, num uint32) (err error) {
idBuf := make([]byte, 4)
binary.BigEndian.PutUint32(idBuf, num)
resp, err := mp.submit(opFSMUniqID, idBuf)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
var (
status = proto.OpErr
reply []byte
)
idResp := resp.(*UniqIdResp)
if idResp.Status == proto.OpOk {
resp := &GetUniqIDResp{
Start: idResp.Start,
}
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
func (mp *metaPartition) allocateUniqID(num uint32) (start, end uint64) {
for {
// cur is the last allocated id
cur := mp.GetUniqId()
start = cur + 1
end = cur + uint64(num)
if atomic.CompareAndSwapUint64(&mp.config.UniqId, cur, end) {
return start, end
}
}
}
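// allocateUniqID reserves a contiguous block of IDs with one compare-and-swap
// on mp.config.UniqId; the loop only repeats when another goroutine advanced
// the counter between the read and the CAS. A minimal sketch of claiming a
// block of 8 IDs (hypothetical count):
//
//    start, end := mp.allocateUniqID(8)
//    for id := start; id <= end; id++ {
//        // every id in [start, end] is owned exclusively by this caller
//    }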
func (mp *metaPartition) uniqCheckerEvict() (left int, evict int, err error) {
checker := mp.uniqChecker
left, idx, op := checker.evictIndex()
if op == nil {
return left, 0, nil
}
fsmReq := &fsmEvictUniqCheckerRequest{
Idx: idx,
UniqID: op.uniqid,
}
reqBytes, err := json.Marshal(fsmReq)
if err != nil {
panic(err)
}
_, err = mp.submit(opFSMUniqCheckerEvict, reqBytes)
return left, idx + 1, err
}
var (
inodeOnceSize = 16
newInodeOnceSize = 24
)
type InodeOnce struct {
UniqID uint64
Inode uint64 // Inode ID
VerSeq uint64
}
func (i *InodeOnce) Marshal() (val []byte) {
val = make([]byte, newInodeOnceSize)
binary.BigEndian.PutUint64(val[0:8], i.UniqID)
binary.BigEndian.PutUint64(val[8:16], i.Inode)
binary.BigEndian.PutUint64(val[16:24], i.VerSeq)
return val
}
func InodeOnceUnlinkMarshal(req *UnlinkInoReq) []byte {
inoOnce := &InodeOnce{
UniqID: req.UniqID,
Inode: req.Inode,
VerSeq: req.VerSeq,
}
return inoOnce.Marshal()
}
func InodeOnceLinkMarshal(req *LinkInodeReq) []byte {
inoOnce := &InodeOnce{
UniqID: req.UniqID,
Inode: req.Inode,
}
return inoOnce.Marshal()
}
func InodeOnceUnmarshal(val []byte) (i *InodeOnce, err error) {
i = &InodeOnce{}
if len(val) < inodeOnceSize {
return i, fmt.Errorf("size incorrect")
}
i.UniqID = binary.BigEndian.Uint64(val[0:8])
i.Inode = binary.BigEndian.Uint64(val[8:16])
if len(val) == 24 {
i.VerSeq = binary.BigEndian.Uint64(val[16:24])
}
return
}
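// InodeOnce is a fixed-width big-endian record: bytes 0-7 hold UniqID, bytes
// 8-15 hold Inode, and the 24-byte form adds VerSeq in bytes 16-23; a legacy
// 16-byte value simply leaves VerSeq at zero. A round-trip sketch with
// hypothetical values:
//
//    val := (&InodeOnce{UniqID: 7, Inode: 1024, VerSeq: 3}).Marshal()
//    ino, _ := InodeOnceUnmarshal(val) // ino.UniqID == 7, ino.Inode == 1024, ino.VerSeq == 3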
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bufio"
"encoding/binary"
"encoding/json"
"fmt"
"hash/crc32"
"io"
"os"
"path"
"strings"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
mmap "github.com/edsrzf/mmap-go"
)
const (
snapshotDir = "snapshot"
snapshotDirTmp = ".snapshot"
snapshotBackup = ".snapshot_backup"
inodeFile = "inode"
dentryFile = "dentry"
extendFile = "extend"
multipartFile = "multipart"
txInfoFile = "tx_info"
txRbInodeFile = "tx_rb_inode"
txRbDentryFile = "tx_rb_dentry"
applyIDFile = "apply"
TxIDFile = "transactionID"
SnapshotSign = ".sign"
metadataFile = "meta"
metadataFileTmp = ".meta"
uniqIDFile = "uniqID"
uniqCheckerFile = "uniqChecker"
verdataFile = "multiVer"
StaleMetadataSuffix = ".old"
StaleMetadataTimeFormat = "20060102150405.000000000"
verdataInitFile = "multiVerInitFile"
)
func (mp *metaPartition) loadMetadata() (err error) {
metaFile := path.Join(mp.config.RootDir, metadataFile)
fp, err := os.OpenFile(metaFile, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadMetadata]: OpenFile %s", err.Error())
return
}
defer fp.Close()
data, err := io.ReadAll(fp)
if err != nil || len(data) == 0 {
err = errors.NewErrorf("[loadMetadata]: ReadFile %s, data: %s", err.Error(),
string(data))
return
}
mConf := &MetaPartitionConfig{}
if err = json.Unmarshal(data, mConf); err != nil {
err = errors.NewErrorf("[loadMetadata]: Unmarshal MetaPartitionConfig %s",
err.Error())
return
}
if err = mConf.checkMeta(); err != nil {
return
}
mp.config.PartitionId = mConf.PartitionId
mp.config.VolName = mConf.VolName
mp.config.Start = mConf.Start
mp.config.End = mConf.End
mp.config.Peers = mConf.Peers
mp.config.Cursor = mp.config.Start
mp.config.UniqId = 0
mp.uidManager = NewUidMgr(mp.config.VolName, mp.config.PartitionId)
mp.mqMgr = NewQuotaManager(mp.config.VolName, mp.config.PartitionId)
log.LogInfof("loadMetadata: load complete: partitionID(%v) volume(%v) range(%v,%v) cursor(%v)",
mp.config.PartitionId, mp.config.VolName, mp.config.Start, mp.config.End, mp.config.Cursor)
return
}
func (mp *metaPartition) loadInode(rootDir string, crc uint32) (err error) {
var numInodes uint64
defer func() {
if err == nil {
log.LogInfof("loadInode: load complete: partitonID(%v) volume(%v) numInodes(%v)",
mp.config.PartitionId, mp.config.VolName, numInodes)
}
}()
filename := path.Join(rootDir, inodeFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadInode] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadInode] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
inoBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
inoBuf = inoBuf[:4]
// first read length
_, err = io.ReadFull(reader, inoBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadInode]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadInode] ReadHeader: %s", err.Error())
return
}
// length crc
if _, err = crcCheck.Write(inoBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(inoBuf)
// next read body
if uint32(cap(inoBuf)) >= length {
inoBuf = inoBuf[:length]
} else {
inoBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, inoBuf)
if err != nil {
err = errors.NewErrorf("[loadInode] ReadBody: %s", err.Error())
return
}
ino := NewInode(0, 0)
if err = ino.Unmarshal(inoBuf); err != nil {
err = errors.NewErrorf("[loadInode] Unmarshal: %s", err.Error())
return
}
mp.acucumUidSizeByLoad(ino)
// data crc
if _, err = crcCheck.Write(inoBuf); err != nil {
return err
}
mp.size += ino.Size
mp.fsmCreateInode(ino)
mp.checkAndInsertFreeList(ino)
if mp.config.Cursor < ino.Inode {
mp.config.Cursor = ino.Inode
}
numInodes += 1
}
}
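// Inode snapshot records are length-prefixed: a 4-byte big-endian length, then
// the marshaled inode, with both the prefix and the body fed to the running CRC
// so the final sum can be compared against the crc recorded at store time. A
// minimal sketch of producing one record, mirroring storeInode further below:
//
//    data, _ := ino.Marshal()
//    lenBuf := make([]byte, 4)
//    binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
//    // file bytes: lenBuf then data; CRC input: lenBuf then data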
// Load dentry from the dentry snapshot.
func (mp *metaPartition) loadDentry(rootDir string, crc uint32) (err error) {
var numDentries uint64
defer func() {
if err == nil {
log.LogInfof("loadDentry: load complete: partitonID(%v) volume(%v) numDentries(%v)",
mp.config.PartitionId, mp.config.VolName, numDentries)
}
}()
filename := path.Join(rootDir, dentryFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadDentry] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadDentry] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
dentryBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
dentryBuf = dentryBuf[:4]
// First Read 4byte header length
_, err = io.ReadFull(reader, dentryBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadDentry]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadDentry] ReadHeader: %s", err.Error())
return
}
if _, err = crcCheck.Write(dentryBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(dentryBuf)
// next read body
if uint32(cap(dentryBuf)) >= length {
dentryBuf = dentryBuf[:length]
} else {
dentryBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, dentryBuf)
if err != nil {
err = errors.NewErrorf("[loadDentry]: ReadBody: %s", err.Error())
return
}
dentry := &Dentry{}
if err = dentry.Unmarshal(dentryBuf); err != nil {
err = errors.NewErrorf("[loadDentry] Unmarshal: %s", err.Error())
return
}
if status := mp.fsmCreateDentry(dentry, true); status != proto.OpOk {
err = errors.NewErrorf("[loadDentry] createDentry dentry: %v, resp code: %d", dentry, status)
return
}
if _, err = crcCheck.Write(dentryBuf); err != nil {
return err
}
numDentries += 1
}
}
func (mp *metaPartition) loadExtend(rootDir string, crc uint32) (err error) {
filename := path.Join(rootDir, extendFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadExtend] Stat: %s", err.Error())
return err
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadExtend] OpenFile: %s", err.Error())
return err
}
defer func() {
_ = fp.Close()
}()
var mem mmap.MMap
if mem, err = mmap.Map(fp, mmap.RDONLY, 0); err != nil {
return err
}
defer func() {
_ = mem.Unmap()
}()
var offset, n int
// read number of extends
var numExtends uint64
numExtends, n = binary.Uvarint(mem)
offset += n
varintTmp := make([]byte, binary.MaxVarintLen64)
// write number of extends
n = binary.PutUvarint(varintTmp, numExtends)
crcCheck := crc32.NewIEEE()
if _, err = crcCheck.Write(varintTmp[:n]); err != nil {
return
}
for i := uint64(0); i < numExtends; i++ {
// read length
var numBytes uint64
numBytes, n = binary.Uvarint(mem[offset:])
offset += n
var extend *Extend
if extend, err = NewExtendFromBytes(mem[offset : offset+int(numBytes)]); err != nil {
return err
}
if _, err = crcCheck.Write(mem[offset-n : offset]); err != nil {
return err
}
// log.LogDebugf("loadExtend: new extend from bytes: partitionID (%v) volume(%v) inode[%v]",
// mp.config.PartitionId, mp.config.VolName, extend.inode)
_ = mp.fsmSetXAttr(extend)
if _, err = crcCheck.Write(mem[offset : offset+int(numBytes)]); err != nil {
return
}
offset += int(numBytes)
mp.statisticExtendByLoad(extend)
}
log.LogInfof("loadExtend: load complete: partitionID(%v) volume(%v) numExtends(%v) filename(%v)",
mp.config.PartitionId, mp.config.VolName, numExtends, filename)
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("loadExtend: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return nil
}
func (mp *metaPartition) loadMultipart(rootDir string, crc uint32) (err error) {
filename := path.Join(rootDir, multipartFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadMultipart] Stat: %s", err.Error())
return err
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadMultipart] OpenFile: %s", err.Error())
return err
}
defer func() {
_ = fp.Close()
}()
var mem mmap.MMap
if mem, err = mmap.Map(fp, mmap.RDONLY, 0); err != nil {
return err
}
defer func() {
_ = mem.Unmap()
}()
var offset, n int
// read number of multipart
var numMultiparts uint64
numMultiparts, n = binary.Uvarint(mem)
varintTmp := make([]byte, binary.MaxVarintLen64)
// write number of multipart
n = binary.PutUvarint(varintTmp, numMultiparts)
crcCheck := crc32.NewIEEE()
if _, err = crcCheck.Write(varintTmp[:n]); err != nil {
return
}
offset += n
for i := uint64(0); i < numMultiparts; i++ {
// read length
var numBytes uint64
numBytes, n = binary.Uvarint(mem[offset:])
offset += n
if _, err = crcCheck.Write(mem[offset-n : offset]); err != nil {
return err
}
var multipart *Multipart
multipart = MultipartFromBytes(mem[offset : offset+int(numBytes)])
log.LogDebugf("loadMultipart: create multipart from bytes: partitionID(%v) multipartID(%v)", mp.config.PartitionId, multipart.id)
mp.fsmCreateMultipart(multipart)
offset += int(numBytes)
if _, err = crcCheck.Write(mem[offset-int(numBytes) : offset]); err != nil {
return err
}
}
log.LogInfof("loadMultipart: load complete: partitionID(%v) numMultiparts(%v) filename(%v)",
mp.config.PartitionId, numMultiparts, filename)
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadMultipart] check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return nil
}
func (mp *metaPartition) loadApplyID(rootDir string) (err error) {
filename := path.Join(rootDir, applyIDFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadApplyID]: Stat %s", err.Error())
return
}
data, err := os.ReadFile(filename)
if err != nil {
err = errors.NewErrorf("[loadApplyID] ReadFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadApplyID]: ApplyID is empty")
return
}
var cursor uint64
if strings.Contains(string(data), "|") {
_, err = fmt.Sscanf(string(data), "%d|%d", &mp.applyID, &cursor)
} else {
_, err = fmt.Sscanf(string(data), "%d", &mp.applyID)
}
if err != nil {
err = errors.NewErrorf("[loadApplyID] ReadApplyID: %s", err.Error())
return
}
mp.storedApplyId = mp.applyID
if cursor > mp.GetCursor() {
atomic.StoreUint64(&mp.config.Cursor, cursor)
}
log.LogInfof("loadApplyID: load complete: partitionID(%v) volume(%v) applyID(%v) cursor(%v) filename(%v)",
mp.config.PartitionId, mp.config.VolName, mp.applyID, mp.config.Cursor, filename)
return
}
func (mp *metaPartition) loadTxRbDentry(rootDir string, crc uint32) (err error) {
var numTxRbDentry uint64
defer func() {
if err == nil {
log.LogInfof("loadTxRbDentry: load complete: partitonID(%v) volume(%v) numInodes(%v)",
mp.config.PartitionId, mp.config.VolName, numTxRbDentry)
}
}()
filename := path.Join(rootDir, txRbDentryFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadTxRbDentry] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadTxRbDentry] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
txBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
txBuf = txBuf[:4]
// first read length
_, err = io.ReadFull(reader, txBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadTxRbDentry]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadTxRbDentry] ReadHeader: %s", err.Error())
return
}
// length crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(txBuf)
// next read body
if uint32(cap(txBuf)) >= length {
txBuf = txBuf[:length]
} else {
txBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, txBuf)
if err != nil {
err = errors.NewErrorf("[loadTxRbDentry] ReadBody: %s", err.Error())
return
}
txRbDentry := NewTxRollbackDentry(nil, nil, 0)
if err = txRbDentry.Unmarshal(txBuf); err != nil {
err = errors.NewErrorf("[loadTxRbDentry] Unmarshal: %s", err.Error())
return
}
// data crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
// mp.txProcessor.txResource.txRollbackDentries[txRbDentry.txDentryInfo.GetKey()] = txRbDentry
mp.txProcessor.txResource.txRbDentryTree.ReplaceOrInsert(txRbDentry, true)
numTxRbDentry++
}
}
func (mp *metaPartition) loadTxRbInode(rootDir string, crc uint32) (err error) {
var numTxRbInode uint64
defer func() {
if err == nil {
log.LogInfof("loadTxRbInode: load complete: partitonID(%v) volume(%v) numInodes(%v)",
mp.config.PartitionId, mp.config.VolName, numTxRbInode)
}
}()
filename := path.Join(rootDir, txRbInodeFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadTxRbInode] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadTxRbInode] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
txBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
txBuf = txBuf[:4]
// first read length
_, err = io.ReadFull(reader, txBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadTxRbInode]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadTxRbInode] ReadHeader: %s", err.Error())
return
}
// length crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(txBuf)
// next read body
if uint32(cap(txBuf)) >= length {
txBuf = txBuf[:length]
} else {
txBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, txBuf)
if err != nil {
err = errors.NewErrorf("[loadTxRbInode] ReadBody: %s", err.Error())
return
}
txRbInode := NewTxRollbackInode(nil, []uint32{}, nil, 0)
if err = txRbInode.Unmarshal(txBuf); err != nil {
err = errors.NewErrorf("[loadTxRbInode] Unmarshal: %s", err.Error())
return
}
// data crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
mp.txProcessor.txResource.txRbInodeTree.ReplaceOrInsert(txRbInode, true)
numTxRbInode++
}
}
func (mp *metaPartition) loadTxInfo(rootDir string, crc uint32) (err error) {
var numTxInfos uint64
defer func() {
if err == nil {
log.LogInfof("loadTxInfo: load complete: partitonID(%v) volume(%v) numInodes(%v)",
mp.config.PartitionId, mp.config.VolName, numTxInfos)
}
}()
filename := path.Join(rootDir, txInfoFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadTxInfo] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadTxInfo] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
txBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
txBuf = txBuf[:4]
// first read length
_, err = io.ReadFull(reader, txBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadTxInfo]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadTxInfo] ReadHeader: %s", err.Error())
return
}
// length crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(txBuf)
// next read body
if uint32(cap(txBuf)) >= length {
txBuf = txBuf[:length]
} else {
txBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, txBuf)
if err != nil {
err = errors.NewErrorf("[loadTxInfo] ReadBody: %s", err.Error())
return
}
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
if err = txInfo.Unmarshal(txBuf); err != nil {
err = errors.NewErrorf("[loadTxInfo] Unmarshal: %s", err.Error())
return
}
// data crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
mp.txProcessor.txManager.addTxInfo(txInfo)
numTxInfos++
}
}
func (mp *metaPartition) loadTxID(rootDir string) (err error) {
filename := path.Join(rootDir, TxIDFile)
if _, err = os.Stat(filename); err != nil {
err = nil
return
}
data, err := os.ReadFile(filename)
if err != nil {
err = errors.NewErrorf("[loadTxID] OpenFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadTxID]: TxID is empty")
return
}
var txId uint64
_, err = fmt.Sscanf(string(data), "%d", &txId)
if err != nil {
err = errors.NewErrorf("[loadTxID] ReadTxID: %s", err.Error())
return
}
if txId > mp.txProcessor.txManager.txIdAlloc.getTransactionID() {
mp.txProcessor.txManager.txIdAlloc.setTransactionID(txId)
}
log.LogInfof("loadTxID: load complete: partitionID(%v) volume(%v) txId(%v) filename(%v)",
mp.config.PartitionId, mp.config.VolName, mp.txProcessor.txManager.txIdAlloc.getTransactionID(), filename)
return
}
func (mp *metaPartition) loadUniqID(rootDir string) (err error) {
filename := path.Join(rootDir, uniqIDFile)
if _, err = os.Stat(filename); err != nil {
err = nil
return
}
data, err := os.ReadFile(filename)
if err != nil {
err = errors.NewErrorf("[loadUniqID] OpenFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadUniqID]: uniqID is empty")
return
}
var uniqId uint64
_, err = fmt.Sscanf(string(data), "%d", &uniqId)
if err != nil {
err = errors.NewErrorf("[loadUniqID] Read uniqID: %s", err.Error())
return
}
if uniqId > mp.GetUniqId() {
atomic.StoreUint64(&mp.config.UniqId, uniqId)
}
log.LogInfof("loadUniqID: load complete: partitionID(%v) volume(%v) uniqID(%v) filename(%v)",
mp.config.PartitionId, mp.config.VolName, mp.GetUniqId(), filename)
return
}
func (mp *metaPartition) loadUniqChecker(rootDir string, crc uint32) (err error) {
log.LogInfof("loadUniqChecker partition(%v) begin", mp.config.PartitionId)
filename := path.Join(rootDir, uniqCheckerFile)
if _, err = os.Stat(filename); err != nil {
log.LogErrorf("loadUniqChecker get file %s err(%s)", filename, err)
err = nil
return
}
data, err := os.ReadFile(filename)
if err != nil {
log.LogErrorf("loadUniqChecker read file %s err(%s)", filename, err)
err = errors.NewErrorf("[loadUniqChecker] OpenFile: %v", err.Error())
return
}
if err = mp.uniqChecker.UnMarshal(data); err != nil {
log.LogErrorf("loadUniqChecker UnMarshal err(%s)", err)
err = errors.NewErrorf("[loadUniqChecker] Unmarshal: %v", err.Error())
return
}
crcCheck := crc32.NewIEEE()
if _, err = crcCheck.Write(data); err != nil {
log.LogErrorf("loadUniqChecker write to crcCheck failed: %s", err)
return err
}
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadUniqChecker]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
log.LogInfof("loadUniqChecker partition(%v) complete", mp.config.PartitionId)
return
}
func (mp *metaPartition) loadMultiVer(rootDir string, crc uint32) (err error) {
filename := path.Join(rootDir, verdataFile)
if _, err = os.Stat(filename); err != nil {
err = nil
return
}
data, err := os.ReadFile(filename)
if err != nil {
if os.IsNotExist(err) {
err = nil
return
}
err = errors.NewErrorf("[loadMultiVer] OpenFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadMultiVer]: ApplyID is empty")
return
}
var (
verData string
applyId uint64
)
if strings.Contains(string(data), "|") {
_, err = fmt.Sscanf(string(data), "%d|%s", &applyId, &verData)
} else {
_, err = fmt.Sscanf(string(data), "%d", &applyId)
}
if err != nil {
err = errors.NewErrorf("[loadMultiVer] ReadVerList: %s", err.Error())
return
}
var verList []*proto.VolVersionInfo
if err = json.Unmarshal([]byte(verData), &verList); err != nil {
err = errors.NewErrorf("[loadMultiVer] ReadVerList: %s verData(%v) applyId %v", verList, verData, applyId)
return
}
var byteData []byte
if byteData, err = json.Marshal(verList); err != nil {
return
}
sign := crc32.NewIEEE()
if _, err = sign.Write(byteData); err != nil {
return
}
if crc != sign.Sum32() {
return fmt.Errorf("partitionID(%v) volume(%v) calc crc %v not equal with disk %v", mp.config.PartitionId, mp.config.VolName, sign.Sum32(), crc)
}
mp.multiVersionList.VerList = verList
mp.verSeq = mp.multiVersionList.GetLastVer()
log.LogInfof("loadMultiVer: updateVerList load complete: partitionID(%v) volume(%v) applyID(%v) filename(%v) verlist (%v) crc (%v) mp Ver(%v)",
mp.config.PartitionId, mp.config.VolName, mp.applyID, filename, mp.multiVersionList.VerList, crc, mp.verSeq)
return
}
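// The multiVer file pairs the apply index with the JSON-encoded version list
// ("%d|%s", see storeMultiVersion below), and the snapshot CRC covers only the
// JSON payload, so loadMultiVer re-marshals the parsed list and checks the sum
// against that. A sketch of building the on-disk line (verList and applyIndex
// stand for the version list and the store message's apply index):
//
//    verData, _ := json.Marshal(verList)                       // CRC is computed over these bytes
//    line := fmt.Sprintf("%d|%s", applyIndex, string(verData)) // applyIndex | versions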
func (mp *metaPartition) storeMultiVersion(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, verdataFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
var verData []byte
if verData, err = json.Marshal(sm.multiVerList); err != nil {
return
}
sign := crc32.NewIEEE()
if _, err = sign.Write(verData); err != nil {
return
}
crc = sign.Sum32()
if _, err = fp.WriteString(fmt.Sprintf("%d|%s", sm.applyIndex, string(verData))); err != nil {
return
}
log.LogInfof("storeMultiVersion: store complete: partitionID(%v) volume(%v) applyID(%v) verData(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.applyIndex, string(verData), crc)
return
}
func (mp *metaPartition) renameStaleMetadata() (err error) {
if _, err = os.Stat(mp.config.RootDir); err != nil {
if os.IsNotExist(err) {
return nil
}
}
curTime := time.Now().Format(StaleMetadataTimeFormat)
staleMetaDirName := mp.config.RootDir + "_" + curTime + StaleMetadataSuffix
if err = os.Rename(mp.config.RootDir, staleMetaDirName); err != nil {
return err
}
return nil
}
func (mp *metaPartition) persistMetadata() (err error) {
if err = mp.config.checkMeta(); err != nil {
err = errors.NewErrorf("[persistMetadata]->%s", err.Error())
return
}
// TODO Unhandled errors
os.MkdirAll(mp.config.RootDir, 0o755)
filename := path.Join(mp.config.RootDir, metadataFileTmp)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
// TODO Unhandled errors
fp.Sync()
fp.Close()
os.Remove(filename)
}()
data, err := json.Marshal(mp.config)
if err != nil {
return
}
if _, err = fp.Write(data); err != nil {
return
}
if err = os.Rename(filename, path.Join(mp.config.RootDir, metadataFile)); err != nil {
return
}
log.LogInfof("persistMetata: persist complete: partitionID(%v) volume(%v) range(%v,%v) cursor(%v)",
mp.config.PartitionId, mp.config.VolName, mp.config.Start, mp.config.End, mp.config.Cursor)
return
}
func (mp *metaPartition) storeApplyID(rootDir string, sm *storeMsg) (err error) {
filename := path.Join(rootDir, applyIDFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
cursor := mp.GetCursor()
if _, err = fp.WriteString(fmt.Sprintf("%d|%d", sm.applyIndex, cursor)); err != nil {
return
}
log.LogWarnf("storeApplyID: store complete: partitionID(%v) volume(%v) applyID(%v) cursor(%v)",
mp.config.PartitionId, mp.config.VolName, sm.applyIndex, cursor)
return
}
func (mp *metaPartition) storeTxID(rootDir string, sm *storeMsg) (err error) {
filename := path.Join(rootDir, TxIDFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
if _, err = fp.WriteString(fmt.Sprintf("%d", sm.txId)); err != nil {
return
}
log.LogInfof("storeTxID: store complete: partitionID(%v) volume(%v) txId(%v)",
mp.config.PartitionId, mp.config.VolName, sm.txId)
return
}
func (mp *metaPartition) storeTxRbDentry(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, txRbDentryFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.txRbDentryTree.Ascend(func(i BtreeItem) bool {
rbDentry := i.(*TxRollbackDentry)
if data, err = rbDentry.Marshal(); err != nil {
return false
}
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
crc = sign.Sum32()
log.LogInfof("storeTxRbDentry: store complete: partitoinID(%v) volume(%v) numRbDentry(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.txRbDentryTree.Len(), crc)
return
}
func (mp *metaPartition) storeTxRbInode(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, txRbInodeFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.txRbInodeTree.Ascend(func(i BtreeItem) bool {
rbInode := i.(*TxRollbackInode)
if data, err = rbInode.Marshal(); err != nil {
return false
}
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
crc = sign.Sum32()
log.LogInfof("storeTxRbInode: store complete: partitoinID(%v) volume(%v) numRbinode[%v] crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.txRbInodeTree.Len(), crc)
return
}
func (mp *metaPartition) storeTxInfo(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, txInfoFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.txTree.Ascend(func(i BtreeItem) bool {
tx := i.(*proto.TransactionInfo)
if data, err = tx.Marshal(); err != nil {
return false
}
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
crc = sign.Sum32()
log.LogInfof("storeTxInfo: store complete: partitoinID(%v) volume(%v) numTxs(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.txTree.Len(), crc)
return
}
func (mp *metaPartition) storeInode(rootDir string,
sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, inodeFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
size := uint64(0)
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.inodeTree.Ascend(func(i BtreeItem) bool {
ino := i.(*Inode)
if sm.uidRebuild {
mp.acucumUidSizeByStore(ino)
}
if data, err = ino.Marshal(); err != nil {
return false
}
size += ino.Size
mp.fileStats(ino)
// set length
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
// set body
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
mp.acucumRebuildFin(sm.uidRebuild)
crc = sign.Sum32()
mp.size = size
log.LogInfof("storeInode: store complete: partitoinID(%v) volume(%v) numInodes(%v) crc(%v), size (%d)",
mp.config.PartitionId, mp.config.VolName, sm.inodeTree.Len(), crc, size)
return
}
func (mp *metaPartition) storeDentry(rootDir string,
sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, dentryFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.dentryTree.Ascend(func(i BtreeItem) bool {
dentry := i.(*Dentry)
data, err = dentry.Marshal()
if err != nil {
return false
}
// set length
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
crc = sign.Sum32()
log.LogInfof("storeDentry: store complete: partitoinID(%v) volume(%v) numDentries(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.dentryTree.Len(), crc)
return
}
func (mp *metaPartition) storeExtend(rootDir string, sm *storeMsg) (crc uint32, err error) {
extendTree := sm.extendTree
fp := path.Join(rootDir, extendFile)
var f *os.File
f, err = os.OpenFile(fp, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
log.LogDebugf("storeExtend: store start: partitoinID(%v) volume(%v) numInodes(%v) extends(%v)",
mp.config.PartitionId, mp.config.VolName, sm.inodeTree.Len(), sm.extendTree.Len())
defer func() {
closeErr := f.Close()
if err == nil && closeErr != nil {
err = closeErr
}
}()
writer := bufio.NewWriterSize(f, 4*1024*1024)
crc32 := crc32.NewIEEE()
varintTmp := make([]byte, binary.MaxVarintLen64)
var n int
// write number of extends
n = binary.PutUvarint(varintTmp, uint64(extendTree.Len()))
if _, err = writer.Write(varintTmp[:n]); err != nil {
return
}
if _, err = crc32.Write(varintTmp[:n]); err != nil {
return
}
extendTree.Ascend(func(i BtreeItem) bool {
e := i.(*Extend)
var raw []byte
if sm.quotaRebuild {
mp.statisticExtendByStore(e, sm.inodeTree)
}
if raw, err = e.Bytes(); err != nil {
return false
}
// write length
n = binary.PutUvarint(varintTmp, uint64(len(raw)))
if _, err = writer.Write(varintTmp[:n]); err != nil {
return false
}
if _, err = crc32.Write(varintTmp[:n]); err != nil {
return false
}
// write raw
if _, err = writer.Write(raw); err != nil {
return false
}
if _, err = crc32.Write(raw); err != nil {
return false
}
return true
})
log.LogInfof("storeExtend: write data ok: partitoinID(%v) volume(%v) numInodes(%v) extends(%v) quotaRebuild(%v)",
mp.config.PartitionId, mp.config.VolName, sm.inodeTree.Len(), sm.extendTree.Len(), sm.quotaRebuild)
mp.mqMgr.statisticRebuildFin(sm.quotaRebuild)
if err != nil {
return
}
if err = writer.Flush(); err != nil {
return
}
if err = f.Sync(); err != nil {
return
}
crc = crc32.Sum32()
log.LogInfof("storeExtend: store complete: partitoinID(%v) volume(%v) numExtends(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, extendTree.Len(), crc)
return
}
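// storeExtend, like storeMultipart below, frames its snapshot with uvarints
// rather than fixed 4-byte lengths: first the item count, then for each item a
// uvarint length followed by the raw bytes, with every written byte also fed to
// the checksum. A minimal sketch of emitting one item in that framing (raw is
// assumed to hold the item's marshaled bytes):
//
//    varintTmp := make([]byte, binary.MaxVarintLen64)
//    n := binary.PutUvarint(varintTmp, uint64(len(raw)))
//    writer.Write(varintTmp[:n]) // uvarint length, also fed to the checksum
//    writer.Write(raw)           // item bytes, also fed to the checksum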
func (mp *metaPartition) storeMultipart(rootDir string, sm *storeMsg) (crc uint32, err error) {
multipartTree := sm.multipartTree
fp := path.Join(rootDir, multipartFile)
var f *os.File
f, err = os.OpenFile(fp, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
closeErr := f.Close()
if err == nil && closeErr != nil {
err = closeErr
}
}()
writer := bufio.NewWriterSize(f, 4*1024*1024)
crc32 := crc32.NewIEEE()
varintTmp := make([]byte, binary.MaxVarintLen64)
var n int
// write number of multiparts
n = binary.PutUvarint(varintTmp, uint64(multipartTree.Len()))
if _, err = writer.Write(varintTmp[:n]); err != nil {
return
}
if _, err = crc32.Write(varintTmp[:n]); err != nil {
return
}
multipartTree.Ascend(func(i BtreeItem) bool {
m := i.(*Multipart)
var raw []byte
if raw, err = m.Bytes(); err != nil {
return false
}
// write length
n = binary.PutUvarint(varintTmp, uint64(len(raw)))
if _, err = writer.Write(varintTmp[:n]); err != nil {
return false
}
if _, err = crc32.Write(varintTmp[:n]); err != nil {
return false
}
// write raw
if _, err = writer.Write(raw); err != nil {
return false
}
if _, err = crc32.Write(raw); err != nil {
return false
}
return true
})
if err != nil {
return
}
if err = writer.Flush(); err != nil {
return
}
if err = f.Sync(); err != nil {
return
}
crc = crc32.Sum32()
log.LogInfof("storeMultipart: store complete: partitoinID(%v) volume(%v) numMultiparts(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, multipartTree.Len(), crc)
return
}
func (mp *metaPartition) storeUniqID(rootDir string, sm *storeMsg) (err error) {
filename := path.Join(rootDir, uniqIDFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
if _, err = fp.WriteString(fmt.Sprintf("%d", sm.uniqId)); err != nil {
return
}
log.LogInfof("storeUniqID: store complete: partitionID(%v) volume(%v) uniqID(%v)",
mp.config.PartitionId, mp.config.VolName, sm.uniqId)
return
}
func (mp *metaPartition) storeUniqChecker(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, uniqCheckerFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
var data []byte
if data, crc, err = sm.uniqChecker.Marshal(); err != nil {
return
}
if _, err = fp.Write(data); err != nil {
return
}
log.LogInfof("storeUniqChecker: store complete: PartitionID(%v) volume(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, crc)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
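// storeMsg carries a snapshot of the in-memory metadata trees (inodes, dentries,
// extends, multiparts and transaction trees) together with the raft apply index at
// which the snapshot was taken; it is consumed by the background store routine.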
type storeMsg struct {
command uint32
applyIndex uint64
txId uint64
inodeTree *BTree
dentryTree *BTree
extendTree *BTree
multipartTree *BTree
txTree *BTree
txRbInodeTree *BTree
txRbDentryTree *BTree
quotaRebuild bool
uidRebuild bool
uniqId uint64
uniqChecker *uniqChecker
multiVerList []*proto.VolVersionInfo
}
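// startSchedule runs the background persistence loop of the meta partition: it picks
// the pending store message with the highest apply index, dumps it to disk, truncates
// the raft log after a successful dump, and periodically syncs the inode cursor and
// transaction ID through raft while this node is the leader.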
func (mp *metaPartition) startSchedule(curIndex uint64) {
timer := time.NewTimer(time.Hour * 24 * 365)
timer.Stop()
timerCursor := time.NewTimer(intervalToSyncCursor)
scheduleState := common.StateStopped
lastCursor := mp.GetCursor()
dumpFunc := func(msg *storeMsg) {
log.LogWarnf("[startSchedule] partitionId=%d: nowAppID"+
"=%d, applyID=%d", mp.config.PartitionId, curIndex,
msg.applyIndex)
if err := mp.store(msg); err == nil {
// truncate raft log
if mp.raftPartition != nil {
log.LogWarnf("[startSchedule] start trunc, partitionId=%d: nowAppID"+
"=%d, applyID=%d", mp.config.PartitionId, curIndex,
msg.applyIndex)
mp.raftPartition.Truncate(curIndex)
} else {
// may happen while dentries are still being loaded at startup
log.LogWarnf("[startSchedule] raftPartition is nil so skip" +
" truncate raft log")
}
curIndex = msg.applyIndex
} else {
// push the message back to the store channel to retry later
mp.storeChan <- msg
err = errors.NewErrorf("[startSchedule]: dump partition id=%d: %v",
mp.config.PartitionId, err.Error())
log.LogErrorf(err.Error())
exporter.Warning(err.Error())
}
if _, ok := mp.IsLeader(); ok {
timer.Reset(intervalToPersistData)
}
atomic.StoreUint32(&scheduleState, common.StateStopped)
}
go func(stopC chan bool) {
var msgs []*storeMsg
readyChan := make(chan struct{}, 1)
for {
if len(msgs) > 0 {
if atomic.LoadUint32(&scheduleState) == common.StateStopped {
atomic.StoreUint32(&scheduleState, common.StateRunning)
readyChan <- struct{}{}
}
}
select {
case <-stopC:
timer.Stop()
return
case <-readyChan:
var (
maxIdx uint64
maxMsg *storeMsg
)
for _, msg := range msgs {
if curIndex >= msg.applyIndex {
continue
}
if maxIdx < msg.applyIndex {
maxIdx = msg.applyIndex
maxMsg = msg
}
}
if maxMsg != nil {
go dumpFunc(maxMsg)
} else {
if _, ok := mp.IsLeader(); ok {
timer.Reset(intervalToPersistData)
}
atomic.StoreUint32(&scheduleState, common.StateStopped)
}
msgs = msgs[:0]
case msg := <-mp.storeChan:
switch msg.command {
case startStoreTick:
timer.Reset(intervalToPersistData)
case stopStoreTick:
timer.Stop()
case opFSMStoreTick:
msgs = append(msgs, msg)
default:
// do nothing
}
case <-timer.C:
log.LogDebugf("[startSchedule] intervalToPersistData curIndex: %v,apply:%v", curIndex, mp.applyID)
if mp.applyID <= curIndex {
timer.Reset(intervalToPersistData)
continue
}
if _, err := mp.submit(opFSMStoreTick, nil); err != nil {
log.LogErrorf("[startSchedule] raft submit: %s", err.Error())
if _, ok := mp.IsLeader(); ok {
timer.Reset(intervalToPersistData)
}
}
case <-timerCursor.C:
if _, ok := mp.IsLeader(); !ok {
timerCursor.Reset(intervalToSyncCursor)
continue
}
curCursor := mp.GetCursor()
if curCursor == lastCursor {
log.LogDebugf("[startSchedule] partitionId=%d: curCursor[%v]=lastCursor[%v]",
mp.config.PartitionId, curCursor, lastCursor)
timerCursor.Reset(intervalToSyncCursor)
continue
}
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, curCursor)
if _, err := mp.submit(opFSMSyncCursor, buf); err != nil {
log.LogErrorf("[startSchedule] raft submit: %s", err.Error())
}
binary.BigEndian.PutUint64(buf, mp.txProcessor.txManager.txIdAlloc.getTransactionID())
if _, err := mp.submit(opFSMSyncTxID, buf); err != nil {
log.LogErrorf("[startSchedule] raft submit: %s", err.Error())
}
lastCursor = curCursor
timerCursor.Reset(intervalToSyncCursor)
}
}
}(mp.stopC)
}
func (mp *metaPartition) stop() {
if mp.stopC != nil {
close(mp.stopC)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"os"
"strconv"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// startRaftServer initializes the address resolver and the raftStore server instance.
func (m *MetaNode) startRaftServer(cfg *config.Config) (err error) {
_, err = os.Stat(m.raftDir)
if err != nil {
if !os.IsNotExist(err) {
return
}
if err = os.MkdirAll(m.raftDir, 0o755); err != nil {
err = errors.NewErrorf("create raft server dir: %s", err.Error())
return
}
}
if m.clusterUuidEnable {
if err = config.CheckOrStoreClusterUuid(m.raftDir, m.clusterUuid, false); err != nil {
log.LogErrorf("CheckOrStoreClusterUuid failed: %v", err)
return fmt.Errorf("CheckOrStoreClusterUuid failed: %v", err)
}
}
heartbeatPort, _ := strconv.Atoi(m.raftHeartbeatPort)
replicaPort, _ := strconv.Atoi(m.raftReplicatePort)
raftConf := &raftstore.Config{
NodeID: m.nodeId,
RaftPath: m.raftDir,
IPAddr: m.localAddr,
HeartbeatPort: heartbeatPort,
ReplicaPort: replicaPort,
TickInterval: m.tickInterval,
RecvBufSize: m.raftRecvBufSize,
NumOfLogsToRetain: m.raftRetainLogs,
}
m.raftStore, err = raftstore.NewRaftStore(raftConf, cfg)
if err != nil {
err = errors.NewErrorf("new raftStore: %s", err.Error())
}
return
}
func (m *MetaNode) stopRaftServer() {
if m.raftStore != nil {
m.raftStore.Stop()
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"io"
"net"
"github.com/xtaci/smux"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
// startServer binds and listens on the configured TCP port.
func (m *MetaNode) startServer() (err error) {
// initialize and start the server.
m.httpStopC = make(chan uint8)
addr := fmt.Sprintf(":%s", m.listen)
if m.bindIp {
addr = fmt.Sprintf("%s:%s", m.localAddr, m.listen)
}
ln, err := net.Listen("tcp", addr)
if err != nil {
return
}
go func(stopC chan uint8) {
defer ln.Close()
for {
conn, err := ln.Accept()
select {
case <-stopC:
return
default:
}
if err != nil {
continue
}
go m.serveConn(conn, stopC)
}
}(m.httpStopC)
log.LogInfof("start server over...")
return
}
func (m *MetaNode) stopServer() {
if m.httpStopC != nil {
defer func() {
if r := recover(); r != nil {
log.LogErrorf("action[StopTcpServer],err:%v", r)
}
}()
close(m.httpStopC)
}
}
// serveConn reads data from the specified TCP connection until the connection is closed by the remote end or the TCP service is shut down.
func (m *MetaNode) serveConn(conn net.Conn, stopC chan uint8) {
defer func() {
conn.Close()
m.RemoveConnection()
}()
m.AddConnection()
c := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
remoteAddr := conn.RemoteAddr().String()
for {
select {
case <-stopC:
return
default:
}
p := &Packet{}
if err := p.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime); err != nil {
if err != io.EOF {
log.LogError("serve MetaNode: ", err.Error())
}
return
}
if err := m.handlePacket(conn, p, remoteAddr); err != nil {
log.LogErrorf("serve handlePacket fail: %v", err)
}
}
}
func (m *MetaNode) handlePacket(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
// Handle request
err = m.metadataManager.HandleMetadataOperation(conn, p, remoteAddr)
return
}
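// startSmuxServer listens on the smux port derived from the TCP listen port by
// smuxPortShift and serves each accepted connection in its own goroutine.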
func (m *MetaNode) startSmuxServer() (err error) {
// initialize and start the server.
m.smuxStopC = make(chan uint8)
ipPort := fmt.Sprintf(":%s", m.listen)
if m.bindIp {
ipPort = fmt.Sprintf("%s:%s", m.localAddr, m.listen)
}
addr := util.ShiftAddrPort(ipPort, smuxPortShift)
ln, err := net.Listen("tcp", addr)
if err != nil {
return
}
go func(stopC chan uint8) {
defer ln.Close()
for {
conn, err := ln.Accept()
select {
case <-stopC:
return
default:
}
if err != nil {
continue
}
go m.serveSmuxConn(conn, stopC)
}
}(m.smuxStopC)
log.LogInfof("start Smux Server over...")
return
}
func (m *MetaNode) stopSmuxServer() {
if smuxPool != nil {
smuxPool.Close()
log.LogDebugf("action[stopSmuxServer] stop smux conn pool")
}
if m.smuxStopC != nil {
defer func() {
if r := recover(); r != nil {
log.LogErrorf("action[stopSmuxServer],err:%v", r)
}
}()
close(m.smuxStopC)
}
}
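// serveSmuxConn upgrades the accepted TCP connection to an smux session and serves
// every accepted stream in a separate goroutine until the service is stopped.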
func (m *MetaNode) serveSmuxConn(conn net.Conn, stopC chan uint8) {
defer func() {
conn.Close()
m.RemoveConnection()
}()
m.AddConnection()
c := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
remoteAddr := conn.RemoteAddr().String()
var sess *smux.Session
var err error
sess, err = smux.Server(conn, smuxPoolCfg.Config)
if err != nil {
log.LogErrorf("action[serveSmuxConn] failed to serve smux connection, err(%v)", err)
return
}
defer sess.Close()
for {
select {
case <-stopC:
return
default:
}
stream, err := sess.AcceptStream()
if err != nil {
if util.FilterSmuxAcceptError(err) != nil {
log.LogErrorf("action[startSmuxService] failed to accept, err: %s", err)
} else {
log.LogInfof("action[startSmuxService] accept done, err: %s", err)
}
break
}
go m.serveSmuxStream(stream, remoteAddr, stopC)
}
}
func (m *MetaNode) serveSmuxStream(stream *smux.Stream, remoteAddr string, stopC chan uint8) {
for {
select {
case <-stopC:
return
default:
}
p := &Packet{}
if err := p.ReadFromConnWithVer(stream, proto.NoReadDeadlineTime); err != nil {
if err != io.EOF {
log.LogError("serve MetaNode: ", err.Error())
}
return
}
if err := m.handlePacket(stream, p, remoteAddr); err != nil {
log.LogErrorf("serve handlePacket fail: %v", err)
}
}
}
package metanode
import (
"bytes"
"encoding/json"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/log"
)
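// SortedExtents maintains an inode's extent keys ordered by FileOffset;
// all accesses are guarded by the embedded RWMutex.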
type SortedExtents struct {
sync.RWMutex
eks []proto.ExtentKey
}
func NewSortedExtents() *SortedExtents {
return &SortedExtents{
eks: make([]proto.ExtentKey, 0),
}
}
// attention: only used for deleted eks
func NewSortedExtentsFromEks(eks []proto.ExtentKey) *SortedExtents {
return &SortedExtents{
eks: eks,
}
}
func (se *SortedExtents) String() string {
se.RLock()
data, err := json.Marshal(se.eks)
se.RUnlock()
if err != nil {
return ""
}
return string(data)
}
func (se *SortedExtents) MarshalBinary(v3 bool) ([]byte, error) {
var data []byte
se.RLock()
defer se.RUnlock()
data = make([]byte, 0, proto.ExtentLength*len(se.eks))
for _, ek := range se.eks {
ekdata, err := ek.MarshalBinary(v3)
if err != nil {
return nil, err
}
data = append(data, ekdata...)
}
return data, nil
}
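// UnmarshalBinary decodes the extent keys in their original order. For split extents
// it also returns a map that counts, per extent ID, how many split keys were decoded.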
func (se *SortedExtents) UnmarshalBinary(data []byte, v3 bool) (err error, splitMap *sync.Map) {
se.Lock()
defer se.Unlock()
buf := bytes.NewBuffer(data)
for {
var ek proto.ExtentKey
if buf.Len() == 0 {
break
}
if err = ek.UnmarshalBinary(buf, v3); err != nil {
return
}
// Don't use se.Append here, since we need to retain the raw ek order.
se.eks = append(se.eks, ek)
if ek.IsSplit() {
if splitMap == nil {
splitMap = new(sync.Map)
}
val, ok := splitMap.Load(ek.GenerateId())
if !ok {
splitMap.Store(ek.GenerateId(), uint32(1))
continue
}
splitMap.Store(ek.GenerateId(), val.(uint32)+1)
}
}
return
}
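// Append inserts ek into the sorted extent list and returns the fully covered keys
// that reference a different physical extent than ek and therefore must be deleted.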
func (se *SortedExtents) Append(ek proto.ExtentKey) (deleteExtents []proto.ExtentKey) {
endOffset := ek.FileOffset + uint64(ek.Size)
se.Lock()
defer se.Unlock()
if len(se.eks) <= 0 {
se.eks = append(se.eks, ek)
return
}
lastKey := se.eks[len(se.eks)-1]
if lastKey.FileOffset+uint64(lastKey.Size) <= ek.FileOffset {
se.eks = append(se.eks, ek)
return
}
firstKey := se.eks[0]
if firstKey.FileOffset >= endOffset {
eks := se.doCopyExtents()
se.eks = se.eks[:0]
se.eks = append(se.eks, ek)
se.eks = append(se.eks, eks...)
return
}
var startIndex, endIndex int
invalidExtents := make([]proto.ExtentKey, 0)
for idx, key := range se.eks {
if ek.FileOffset > key.FileOffset {
startIndex = idx + 1
continue
}
if endOffset >= key.FileOffset+uint64(key.Size) {
invalidExtents = append(invalidExtents, key)
continue
}
break
}
endIndex = startIndex + len(invalidExtents)
upperExtents := make([]proto.ExtentKey, len(se.eks)-endIndex)
copy(upperExtents, se.eks[endIndex:])
se.eks = se.eks[:startIndex]
se.eks = append(se.eks, ek)
se.eks = append(se.eks, upperExtents...)
// check whether ek and key refer to the same extent file whose size was extended
deleteExtents = make([]proto.ExtentKey, 0, len(invalidExtents))
for _, key := range invalidExtents {
if key.PartitionId != ek.PartitionId || key.ExtentId != ek.ExtentId {
deleteExtents = append(deleteExtents, key)
}
}
return
}
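// storeEkSplit marks ek as a split extent and increases its reference count in ekRef,
// keyed by (PartitionId<<32 | ExtentId).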
func storeEkSplit(mpId uint64, inodeID uint64, ekRef *sync.Map, ek *proto.ExtentKey) (id uint64) {
if ekRef == nil {
log.LogErrorf("[storeEkSplit] mpId [%v] inodeID %v ekRef nil", mpId, inodeID)
return
}
log.LogDebugf("[storeEkSplit] mpId [%v] inode[%v] dp [%v] extent id[%v] ek [%v]", mpId, inodeID, ek.PartitionId, ek.ExtentId, ek)
id = ek.PartitionId<<32 | ek.ExtentId
var v uint32
if val, ok := ekRef.Load(id); !ok {
if ek.IsSplit() {
log.LogErrorf("[storeEkSplit] mpId [%v]inode id[%v] ek [%v] already be set split", mpId, inodeID, ek)
}
v = 1
} else {
v = val.(uint32) + 1
}
ek.SetSplit(true)
ekRef.Store(id, v)
log.LogDebugf("[storeEkSplit] mpId [%v] inode[%v] dp [%v] extent id[%v].key %v, cnt %v", mpId, inodeID, ek.PartitionId, ek.ExtentId,
ek.PartitionId<<32|ek.ExtentId, v)
return
}
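// SplitWithCheck splits the existing extent key that covers the range of ekSplit.
// Depending on whether the split range sits at the head, the tail or the middle of the
// matched key, the key is shrunk and ekSplit (plus a possible remainder key) is inserted;
// the carved-out range is returned in delExtents for deletion.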
func (se *SortedExtents) SplitWithCheck(mpId uint64, inodeID uint64, ekSplit proto.ExtentKey, ekRef *sync.Map) (delExtents []proto.ExtentKey, status uint8) {
status = proto.OpOk
endOffset := ekSplit.FileOffset + uint64(ekSplit.Size)
log.LogDebugf("[SplitWithCheck] mpId [%v]. inode[%v] ekSplit ek [%v]", mpId, inodeID, ekSplit)
se.Lock()
defer se.Unlock()
if len(se.eks) <= 0 {
log.LogErrorf("[SplitWithCheck] mpId [%v]. inode[%v] eks empty cann't find ek [%v]", mpId, inodeID, ekSplit)
status = proto.OpArgMismatchErr
return
}
lastKey := se.eks[len(se.eks)-1]
if lastKey.FileOffset+uint64(lastKey.Size) <= ekSplit.FileOffset {
log.LogErrorf("[SplitWithCheck] mpId [%v]. inode[%v] eks do split not found", mpId, inodeID)
status = proto.OpArgMismatchErr
return
}
firstKey := se.eks[0]
if firstKey.FileOffset >= endOffset {
log.LogErrorf("[SplitWithCheck] mpId [%v]. inode[%v] eks do split not found", mpId, inodeID)
status = proto.OpArgMismatchErr
return
}
var startIndex int
for idx, key := range se.eks {
if ekSplit.FileOffset >= key.FileOffset {
startIndex = idx + 1
continue
}
if endOffset >= key.FileOffset+uint64(key.Size) {
continue
}
break
}
if startIndex == 0 {
status = proto.OpArgMismatchErr
log.LogErrorf("[SplitWithCheck] mpId [%v]. inode[%v] should have no valid extent request [%v]", mpId, inodeID, ekSplit)
return
}
key := &se.eks[startIndex-1]
if !storage.IsTinyExtent(key.ExtentId) && (key.PartitionId != ekSplit.PartitionId || key.ExtentId != ekSplit.ExtentId) {
status = proto.OpArgMismatchErr
log.LogErrorf("SplitWithCheck. mpId [%v] inode[%v] key found with mismatch extent info [%v] request [%v]", mpId, inodeID, key, ekSplit)
return
}
keySize := key.Size
key.AddModGen()
if !key.IsSplit() {
storeEkSplit(mpId, inodeID, ekRef, key)
}
if ekSplit.FileOffset+uint64(ekSplit.Size) > key.FileOffset+uint64(key.Size) {
status = proto.OpArgMismatchErr
log.LogErrorf("SplitWithCheck. mpId [%v] inode[%v] request [%v] out scope of exist key [%v]", mpId, inodeID, ekSplit, key)
return
}
// Makes the request idempotent, just in case client retries.
if ekSplit.IsEqual(key) {
log.LogWarnf("SplitWithCheck. mpId [%v] request key %v is a repeat request", mpId, key)
return
}
delKey := *key
delKey.ExtentOffset = key.ExtentOffset + (ekSplit.FileOffset - key.FileOffset)
delKey.Size = ekSplit.Size
storeEkSplit(mpId, inodeID, ekRef, &delKey)
if ekSplit.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v] inode[%v] delKey %v,key %v, eksplit %v", mpId, inodeID, delKey, key, ekSplit)
}
delKey.FileOffset = ekSplit.FileOffset
delExtents = append(delExtents, delKey)
log.LogDebugf("SplitWithCheck. mpId [%v] inode[%v] key offset %v, split FileOffset %v, startIndex %v,key [%v], ekSplit[%v] delkey [%v]", mpId, inodeID,
key.FileOffset, ekSplit.FileOffset, startIndex, key, ekSplit, delKey)
if key.FileOffset == ekSplit.FileOffset { // at the begin
keyDup := *key
eks := make([]proto.ExtentKey, len(se.eks)-startIndex)
copy(eks, se.eks[startIndex:])
se.eks = se.eks[:startIndex-1]
var keyBefore *proto.ExtentKey
if len(se.eks) > 0 {
keyBefore = &se.eks[len(se.eks)-1]
log.LogDebugf("SplitWithCheck. mpId [%v].keyBefore. ek [%v] and ekSplit %v", mpId, keyBefore, ekSplit)
}
if keyBefore != nil && keyBefore.IsSequenceWithSameSeq(&ekSplit) {
log.LogDebugf("SplitWithCheck. mpId [%v]. inode[%v] keyBefore [%v], ekSplit [%v]", mpId, inodeID, keyBefore, ekSplit)
log.LogDebugf("SplitWithCheck. mpId [%v].merge.head. ek [%v] and %v", mpId, keyBefore, ekSplit)
keyBefore.Size += ekSplit.Size
} else {
se.eks = append(se.eks, ekSplit)
storeEkSplit(mpId, inodeID, ekRef, &ekSplit)
}
keyDup.FileOffset = keyDup.FileOffset + uint64(ekSplit.Size)
keyDup.ExtentOffset = keyDup.ExtentOffset + uint64(ekSplit.Size)
keyDup.Size = keySize - ekSplit.Size
if keyDup.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v] inode[%v] delKey %v,keyDup %v, eksplit %v", mpId, inodeID, delKey, keyDup, ekSplit)
}
se.eks = append(se.eks, keyDup)
se.eks = append(se.eks, eks...)
} else if key.FileOffset+uint64(key.Size) == ekSplit.FileOffset+uint64(ekSplit.Size) { // in the end
key.Size = keySize - ekSplit.Size
if key.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v].inode[%v] delKey %v,key %v, eksplit %v", mpId, inodeID, delKey, key, ekSplit)
}
eks := make([]proto.ExtentKey, len(se.eks[startIndex:]))
copy(eks, se.eks[startIndex:])
se.eks = se.eks[:startIndex]
if len(eks) > 0 && ekSplit.IsSequenceWithSameSeq(&eks[0]) {
log.LogDebugf("SplitWithCheck.mpId [%v].merge.end. ek [%v] and %v", mpId, ekSplit, eks[0])
eks[0].FileOffset = ekSplit.FileOffset
eks[0].ExtentOffset = ekSplit.ExtentOffset
eks[0].Size += ekSplit.Size
} else {
se.eks = append(se.eks, ekSplit)
storeEkSplit(mpId, inodeID, ekRef, &ekSplit)
}
se.eks = append(se.eks, eks...)
} else { // in the middle
key.Size = uint32(ekSplit.FileOffset - key.FileOffset)
if key.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v].inode[%v] delKey %v,key %v, eksplit %v", mpId, inodeID, delKey, key, ekSplit)
}
eks := make([]proto.ExtentKey, len(se.eks[startIndex:]))
copy(eks, se.eks[startIndex:])
se.eks = se.eks[:startIndex]
se.eks = append(se.eks, ekSplit)
storeEkSplit(mpId, inodeID, ekRef, &ekSplit)
mKey := &proto.ExtentKey{
FileOffset: ekSplit.FileOffset + uint64(ekSplit.Size),
PartitionId: key.PartitionId,
ExtentId: key.ExtentId,
ExtentOffset: key.ExtentOffset + uint64(key.Size) + uint64(ekSplit.Size),
Size: keySize - key.Size - ekSplit.Size,
// crc
SnapInfo: &proto.ExtSnapInfo{
VerSeq: key.GetSeq(),
ModGen: 0,
IsSplit: true,
},
}
se.eks = append(se.eks, *mKey)
storeEkSplit(mpId, inodeID, ekRef, mKey)
if keySize-key.Size-ekSplit.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v].inode[%v] keySize %v,key %v, eksplit %v", mpId, inodeID, keySize, key, ekSplit)
}
se.eks = append(se.eks, eks...)
}
return
}
func (se *SortedExtents) CheckAndAddRef(lastKey *proto.ExtentKey, currEk *proto.ExtentKey, addRefFunc func(*proto.ExtentKey)) (ok bool) {
if !lastKey.IsSameExtent(currEk) {
return
}
log.LogDebugf("action[AppendWithCheck.CheckAndAddRef] ek [%v],lastKey %v", currEk, lastKey)
if lastKey.FileOffset+uint64(lastKey.Size) <= currEk.FileOffset {
if !lastKey.IsSplit() {
addRefFunc(lastKey)
}
addRefFunc(currEk)
ok = true
return
}
if lastKey.FileOffset == currEk.FileOffset &&
lastKey.PartitionId == currEk.PartitionId &&
lastKey.ExtentId == currEk.ExtentId &&
lastKey.ExtentOffset == currEk.ExtentOffset && lastKey.Size < currEk.Size && lastKey.GetSeq() < currEk.GetSeq() {
log.LogDebugf("action[AppendWithCheck.CheckAndAddRef] split append key %v", currEk)
currEk.FileOffset = lastKey.FileOffset + uint64(lastKey.Size)
currEk.ExtentOffset = currEk.ExtentOffset + uint64(lastKey.Size)
currEk.Size = currEk.Size - lastKey.Size
log.LogDebugf("action[AppendWithCheck.CheckAndAddRef] after split append key %v", currEk)
if !lastKey.IsSplit() {
addRefFunc(lastKey)
}
addRefFunc(currEk)
ok = true
return
}
return
}
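// AppendWithCheck appends ek with conflict detection: the extent keys fully covered by
// ek must match the discard list supplied by the client (clientDiscardExts), otherwise
// OpConflictExtentsErr is returned.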
func (se *SortedExtents) AppendWithCheck(inodeID uint64, ek proto.ExtentKey, addRefFunc func(*proto.ExtentKey), clientDiscardExts []proto.ExtentKey) (deleteExtents []proto.ExtentKey, status uint8) {
status = proto.OpOk
endOffset := ek.FileOffset + uint64(ek.Size)
se.Lock()
defer se.Unlock()
log.LogDebugf("action[AppendWithCheck] ek [%v], clientDiscardExts [%v] se.eks [%v]", ek, clientDiscardExts, se.eks)
if len(se.eks) <= 0 {
se.eks = append(se.eks, ek)
return
}
idx := len(se.eks) - 1
tailKey := &se.eks[idx]
log.LogDebugf("action[AppendWithCheck] ek [%v],tailKey %v, clientDiscardExts [%v] se.eks [%v]", ek, tailKey, clientDiscardExts, se.eks)
if ok := se.CheckAndAddRef(tailKey, &ek, addRefFunc); ok {
se.eks = append(se.eks, ek)
return
}
firstKey := se.eks[0]
if firstKey.FileOffset >= endOffset {
se.insert(ek, 0)
return
}
var startIndex, endIndex int
invalidExtents := make([]proto.ExtentKey, 0)
for idx, key := range se.eks {
if ek.FileOffset > key.FileOffset {
startIndex = idx + 1
continue
}
if endOffset >= key.FileOffset+uint64(key.Size) {
invalidExtents = append(invalidExtents, key)
continue
}
break
}
// Makes the request idempotent, just in case client retries.
if len(invalidExtents) == 1 && invalidExtents[0].Equals(&ek) {
log.LogDebugf("action[AppendWithCheck] ek [%v]", ek)
return
}
// check whether ek and key refer to the same extent file whose size was extended
deleteExtents = make([]proto.ExtentKey, 0, len(invalidExtents))
for _, key := range invalidExtents {
if key.PartitionId != ek.PartitionId || key.ExtentId != ek.ExtentId || key.ExtentOffset != ek.ExtentOffset {
deleteExtents = append(deleteExtents, key)
}
}
log.LogDebugf("action[AppendWithCheck] invalidExtents(%v) deleteExtents(%v) discardExtents(%v)", invalidExtents, deleteExtents, clientDiscardExts)
if clientDiscardExts != nil {
if len(deleteExtents) != len(clientDiscardExts) {
log.LogErrorf("action[AppendWithCheck] OpConflictExtentsErr error. inode[%v] deleteExtents [%v] clientDiscardExts [%v]", inodeID, deleteExtents, clientDiscardExts)
return deleteExtents, proto.OpConflictExtentsErr
}
for i := 0; i < len(clientDiscardExts); i++ {
if deleteExtents[i].PartitionId != clientDiscardExts[i].PartitionId || deleteExtents[i].ExtentId != clientDiscardExts[i].ExtentId || deleteExtents[i].ExtentOffset != clientDiscardExts[i].ExtentOffset {
log.LogDebugf("action[AppendWithCheck] OpConflictExtentsErr error. inode[%v] idx %v deleteExtents[%v] clientDiscardExts [%v]", inodeID, i, deleteExtents[i], clientDiscardExts[i])
return deleteExtents, proto.OpConflictExtentsErr
}
}
} else if len(deleteExtents) != 0 {
log.LogDebugf("action[AppendWithCheck] OpConflictExtentsErr error. inode[%v] deleteExtents [%v]", inodeID, deleteExtents)
return deleteExtents, proto.OpConflictExtentsErr
}
defer func() {
if startIndex == 0 {
return
}
se.CheckAndAddRef(&se.eks[startIndex-1], &se.eks[startIndex], addRefFunc)
}()
if len(invalidExtents) == 0 {
se.insert(ek, startIndex)
return
}
endIndex = startIndex + len(invalidExtents)
se.instertWithDiscard(ek, startIndex, endIndex)
return
}
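// Truncate drops all extent keys at or beyond offset and returns them for deletion;
// if the last remaining key crosses the offset, it is shrunk and its tail is returned as well.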
func (se *SortedExtents) Truncate(offset uint64, doOnLastKey func(*proto.ExtentKey), insertRefMap func(ek *proto.ExtentKey)) (deleteExtents []proto.ExtentKey) {
var endIndex int
se.Lock()
defer se.Unlock()
endIndex = -1
for idx, key := range se.eks {
if key.FileOffset >= offset {
endIndex = idx
break
}
}
if endIndex < 0 {
deleteExtents = make([]proto.ExtentKey, 0)
} else {
deleteExtents = make([]proto.ExtentKey, len(se.eks)-endIndex)
copy(deleteExtents, se.eks[endIndex:])
se.eks = se.eks[:endIndex]
}
numKeys := len(se.eks)
if numKeys > 0 {
lastKey := &se.eks[numKeys-1]
if lastKey.FileOffset+uint64(lastKey.Size) > offset {
if doOnLastKey != nil {
doOnLastKey(&proto.ExtentKey{Size: uint32(lastKey.FileOffset + uint64(lastKey.Size) - offset)})
}
rsKey := &proto.ExtentKey{}
*rsKey = *lastKey
lastKey.Size = uint32(offset - lastKey.FileOffset)
if insertRefMap != nil {
insertRefMap(lastKey)
}
rsKey.Size -= lastKey.Size
rsKey.FileOffset += uint64(lastKey.Size)
rsKey.ExtentOffset += uint64(lastKey.Size)
if insertRefMap != nil {
insertRefMap(rsKey)
}
deleteExtents = append([]proto.ExtentKey{*rsKey}, deleteExtents...)
log.LogDebugf("SortedExtents.Truncate rsKey %v, deleteExtents %v", rsKey, deleteExtents)
}
}
return
}
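// insert places ek at position startIdx and shifts the following keys one slot to the right.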
func (se *SortedExtents) insert(ek proto.ExtentKey, startIdx int) {
se.eks = append(se.eks, ek)
size := len(se.eks)
for idx := size - 1; idx > startIdx; idx-- {
se.eks[idx] = se.eks[idx-1]
}
se.eks[startIdx] = ek
}
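// instertWithDiscard replaces the keys in [startIdx, endIdx) with ek and compacts the slice.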
func (se *SortedExtents) instertWithDiscard(ek proto.ExtentKey, startIdx, endIdx int) {
upperSize := len(se.eks) - endIdx
se.eks[startIdx] = ek
for idx := 0; idx < upperSize; idx++ {
se.eks[startIdx+1+idx] = se.eks[endIdx+idx]
}
se.eks = se.eks[:startIdx+1+upperSize]
}
func (se *SortedExtents) Len() int {
se.RLock()
defer se.RUnlock()
return len(se.eks)
}
// LayerSize returns the sum of the sizes of all extent keys (not the file size).
func (se *SortedExtents) LayerSize() (layerSize uint64) {
se.RLock()
defer se.RUnlock()
last := len(se.eks)
if last <= 0 {
return uint64(0)
}
for _, ek := range se.eks {
layerSize += uint64(ek.Size)
}
return
}
// Returns the file size
func (se *SortedExtents) Size() uint64 {
se.RLock()
defer se.RUnlock()
last := len(se.eks)
if last <= 0 {
return uint64(0)
}
return se.eks[last-1].FileOffset + uint64(se.eks[last-1].Size)
}
func (se *SortedExtents) Range(f func(index int, ek proto.ExtentKey) bool) {
se.RLock()
defer se.RUnlock()
for i, ek := range se.eks {
if !f(i, ek) {
break
}
}
}
func (se *SortedExtents) Clone() *SortedExtents {
newSe := NewSortedExtents()
se.RLock()
defer se.RUnlock()
newSe.eks = se.doCopyExtents()
return newSe
}
func (se *SortedExtents) CopyExtents() []proto.ExtentKey {
se.RLock()
defer se.RUnlock()
return se.doCopyExtents()
}
func (se *SortedExtents) CopyTinyExtents() []proto.ExtentKey {
se.RLock()
defer se.RUnlock()
return se.doCopyTinyExtents()
}
func (se *SortedExtents) doCopyExtents() []proto.ExtentKey {
eks := make([]proto.ExtentKey, len(se.eks))
copy(eks, se.eks)
return eks
}
func (se *SortedExtents) doCopyTinyExtents() []proto.ExtentKey {
eks := make([]proto.ExtentKey, 0)
for _, ek := range se.eks {
if storage.IsTinyExtent(ek.ExtentId) {
eks = append(eks, ek)
}
}
return eks
}
// discard code
func (se *SortedExtents) Delete(delEks []proto.ExtentKey) (curEks []proto.ExtentKey) {
se.Lock()
defer se.Unlock()
curEks = make([]proto.ExtentKey, 0, len(se.eks))
for _, key := range se.eks {
delFlag := false
for _, delKey := range delEks {
if key.FileOffset == delKey.ExtentOffset && key.ExtentId == delKey.ExtentId &&
key.ExtentOffset == delKey.ExtentOffset && key.PartitionId == delKey.PartitionId &&
key.Size == delKey.Size {
delFlag = true
break
}
}
if !delFlag {
curEks = append(curEks, key)
}
}
se.eks = curEks
return
}
package metanode
import (
"bytes"
"encoding/json"
"fmt"
"math"
"sync"
"github.com/cubefs/cubefs/proto"
)
type SortedObjExtents struct {
sync.RWMutex
eks []proto.ObjExtentKey
}
func NewSortedObjExtents() *SortedObjExtents {
return &SortedObjExtents{
eks: make([]proto.ObjExtentKey, 0),
}
}
func (se *SortedObjExtents) String() string {
se.RLock()
data, err := json.Marshal(se.eks)
se.RUnlock()
if err != nil {
return ""
}
return string(data)
}
func (se *SortedObjExtents) MarshalBinary() ([]byte, error) {
var data []byte
se.RLock()
defer se.RUnlock()
for _, ek := range se.eks {
ekdata, err := ek.MarshalBinary()
if err != nil {
return nil, err
}
data = append(data, ekdata...)
}
return data, nil
}
func (se *SortedObjExtents) UnmarshalBinary(data []byte) error {
var ek proto.ObjExtentKey
se.Lock()
defer se.Unlock()
buf := bytes.NewBuffer(data)
for {
if buf.Len() == 0 {
break
}
if err := ek.UnmarshalBinary(buf); err != nil {
return err
}
// Don't use se.Append here, since we need to retain the raw ek order.
se.eks = append(se.eks, ek)
}
return nil
}
// Append returns an error if the new ObjExtentKey overlaps with the existing keys.
func (se *SortedObjExtents) Append(ek proto.ObjExtentKey) (err error) {
se.Lock()
defer se.Unlock()
// 1. list is empty
if len(se.eks) <= 0 {
se.eks = append(se.eks, ek)
return
}
// 2. the last key's (FileOffset+Size) equals the new key's FileOffset, so it can be appended directly
lastKey := se.eks[len(se.eks)-1]
if (lastKey.FileOffset + lastKey.Size) == ek.FileOffset {
se.eks = append(se.eks, ek)
return
}
// otherwise, look for an existing key equal to the new one; if none is found, return an error.
for i := len(se.eks) - 1; i >= 0; i-- {
if ek.IsEquals(&se.eks[i]) {
return
}
if se.eks[i].FileOffset < ek.FileOffset {
break
}
}
err = fmt.Errorf("obj extentkeys exist overlay! the new obj extent key must be appended to the last position with offset [%d], new(%s)",
lastKey.FileOffset, ek.String())
return
}
func (se *SortedObjExtents) Clone() *SortedObjExtents {
newSe := NewSortedObjExtents()
se.RLock()
defer se.RUnlock()
newSe.eks = se.doCopyExtents()
return newSe
}
func (se *SortedObjExtents) doCopyExtents() []proto.ObjExtentKey {
eks := make([]proto.ObjExtentKey, len(se.eks))
copy(eks, se.eks)
return eks
}
func (se *SortedObjExtents) CopyExtents() []proto.ObjExtentKey {
se.RLock()
defer se.RUnlock()
return se.doCopyExtents()
}
// Returns the file size
func (se *SortedObjExtents) Size() uint64 {
se.RLock()
defer se.RUnlock()
last := len(se.eks)
if last <= 0 {
return uint64(0)
}
// TODO: maybe we should use ebs location's Size?
return se.eks[last-1].FileOffset + se.eks[last-1].Size
}
func (se *SortedObjExtents) Range(f func(ek proto.ObjExtentKey) bool) {
se.RLock()
defer se.RUnlock()
for _, ek := range se.eks {
if !f(ek) {
break
}
}
}
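// FindOffsetExist binary-searches the extent keys for one whose FileOffset equals
// fileOffset and returns whether it exists together with its index.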
func (se *SortedObjExtents) FindOffsetExist(fileOffset uint64) (bool, int) {
se.RLock()
defer se.RUnlock()
if len(se.eks) <= 0 {
return false, 0
}
left, right, mid := 0, len(se.eks)-1, 0
for {
mid = int(math.Floor(float64((left + right) / 2)))
if se.eks[mid].FileOffset > fileOffset {
right = mid - 1
} else if se.eks[mid].FileOffset < fileOffset {
left = mid + 1
} else {
return true, mid
}
if left > right {
break
}
}
return false, 0
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"net"
"strconv"
"strings"
"sync"
"time"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
// Rollback Type
const (
TxNoOp uint8 = iota
TxUpdate
TxDelete
TxAdd
)
func (i *TxRollbackInode) ToString() string {
content := fmt.Sprintf("{inode:[ino:%v, type:%v, nlink:%v], quotaIds:%v, rbType:%v"+
"txInodeInfo:[Ino:%v, MpID:%v, CreateTime:%v, Timeout:%v, TxID:%v, MpMembers:%v]}",
i.inode.Inode, i.inode.Type, i.inode.NLink, i.quotaIds, i.rbType, i.txInodeInfo.Ino, i.txInodeInfo.MpID,
i.txInodeInfo.CreateTime, i.txInodeInfo.Timeout, i.txInodeInfo.TxID, i.txInodeInfo.MpMembers)
return content
}
type TxRollbackInode struct {
inode *Inode
txInodeInfo *proto.TxInodeInfo
rbType uint8 // Rollback Type
quotaIds []uint32
}
// Less tests whether the current TxRollbackInode item is less than the given one.
func (i *TxRollbackInode) Less(than btree.Item) bool {
ti, ok := than.(*TxRollbackInode)
if !ok {
return false
}
if i.txInodeInfo != nil && ti.txInodeInfo != nil {
return i.txInodeInfo.Ino < ti.txInodeInfo.Ino
}
return i.inode.Inode < ti.inode.Inode
}
// Copy returns a copy of the TxRollbackInode.
func (i *TxRollbackInode) Copy() btree.Item {
item := i.inode.Copy()
txInodeInfo := *i.txInodeInfo
quotaIds := make([]uint32, len(i.quotaIds))
copy(quotaIds, i.quotaIds)
return &TxRollbackInode{
inode: item.(*Inode),
quotaIds: quotaIds,
txInodeInfo: &txInodeInfo,
rbType: i.rbType,
}
}
func (i *TxRollbackInode) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 256))
bs, err := i.inode.Marshal()
if err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return
}
if _, err = buff.Write(bs); err != nil {
return
}
bs, err = i.txInodeInfo.Marshal()
if err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err = buff.Write(bs); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, &i.rbType); err != nil {
return
}
quotaBytes := bytes.NewBuffer(make([]byte, 0, 8))
for _, quotaId := range i.quotaIds {
if err = binary.Write(quotaBytes, binary.BigEndian, quotaId); err != nil {
return
}
}
_, err = buff.Write(quotaBytes.Bytes())
return buff.Bytes(), err
}
func (i *TxRollbackInode) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
ino := NewInode(0, 0)
if err = ino.Unmarshal(data); err != nil {
return
}
i.inode = ino
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInodeInfo := proto.NewTxInodeInfo("", 0, 0)
if err = txInodeInfo.Unmarshal(data); err != nil {
return
}
i.txInodeInfo = txInodeInfo
if err = binary.Read(buff, binary.BigEndian, &i.rbType); err != nil {
return
}
var quotaId uint32
for {
if buff.Len() == 0 {
break
}
if err = binary.Read(buff, binary.BigEndian, "aId); err != nil {
return
}
i.quotaIds = append(i.quotaIds, quotaId)
}
return
}
func NewTxRollbackInode(inode *Inode, quotaIds []uint32, txInodeInfo *proto.TxInodeInfo, rbType uint8) *TxRollbackInode {
return &TxRollbackInode{
inode: inode,
quotaIds: quotaIds,
txInodeInfo: txInodeInfo,
rbType: rbType,
}
}
type TxRollbackDentry struct {
dentry *Dentry
txDentryInfo *proto.TxDentryInfo
rbType uint8 // Rollback Type
}
func (d *TxRollbackDentry) ToString() string {
content := fmt.Sprintf("{dentry:[ParentId:%v, Name:%v, Inode:%v, Type:%v], rbType:%v, "+
"txDentryInfo:[ParentId:%v, Name:%v, MpMembers:%v, TxID:%v, MpID:%v, CreateTime:%v, Timeout:%v]}",
d.dentry.ParentId, d.dentry.Name, d.dentry.Inode, d.dentry.Type, d.rbType, d.txDentryInfo.ParentId, d.txDentryInfo.Name,
d.txDentryInfo.MpMembers, d.txDentryInfo.TxID, d.txDentryInfo.MpID, d.txDentryInfo.CreateTime, d.txDentryInfo.Timeout)
return content
}
// Less tests whether the current TxRollbackDentry item is less than the given one.
func (d *TxRollbackDentry) Less(than btree.Item) bool {
td, ok := than.(*TxRollbackDentry)
return ok && d.txDentryInfo.GetKey() < td.txDentryInfo.GetKey()
}
// Copy returns a copy of the TxRollbackDentry.
func (d *TxRollbackDentry) Copy() btree.Item {
item := d.dentry.Copy()
txDentryInfo := *d.txDentryInfo
return &TxRollbackDentry{
dentry: item.(*Dentry),
txDentryInfo: &txDentryInfo,
rbType: d.rbType,
}
}
func (d *TxRollbackDentry) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 512))
bs, err := d.dentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
log.LogDebugf("TxRollbackDentry Marshal dentry %v", d.dentry)
log.LogDebugf("TxRollbackDentry Marshal txDentryInfo %v", d.ToString())
bs, err = d.txDentryInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &d.rbType); err != nil {
return
}
return buff.Bytes(), nil
}
func (d *TxRollbackDentry) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
log.LogDebugf("TxRollbackDentry Unmarshal len %v", dataLen)
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
dentry := &Dentry{}
if err = dentry.Unmarshal(data); err != nil {
return
}
log.LogDebugf("TxRollbackDentry Unmarshal dentry %v", dentry)
d.dentry = dentry
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txDentryInfo := proto.NewTxDentryInfo("", 0, "", 0)
if err = txDentryInfo.Unmarshal(data); err != nil {
return
}
d.txDentryInfo = txDentryInfo
if err = binary.Read(buff, binary.BigEndian, &d.rbType); err != nil {
return
}
return
}
func NewTxRollbackDentry(dentry *Dentry, txDentryInfo *proto.TxDentryInfo, rbType uint8) *TxRollbackDentry {
return &TxRollbackDentry{
dentry: dentry,
txDentryInfo: txDentryInfo,
rbType: rbType,
}
}
// TM (transaction manager)
type TransactionManager struct {
// need persistence and sync to all the raft members of the mp
txIdAlloc *TxIDAllocator
txTree *BTree
txProcessor *TransactionProcessor
blacklist *util.Set
opLimiter *rate.Limiter
sync.RWMutex
}
// RM (transaction resource manager)
type TransactionResource struct {
txRbInodeTree *BTree // key: inode id
txRbDentryTree *BTree // key: parentId_name
txProcessor *TransactionProcessor
sync.RWMutex
}
type TransactionProcessor struct {
txManager *TransactionManager // TM
txResource *TransactionResource // RM
mp *metaPartition
mask proto.TxOpMask
}
func (p *TransactionProcessor) Reset() {
p.txManager.Reset()
p.txResource.Reset()
}
func (p *TransactionProcessor) Pause() bool {
return p.mask == proto.TxPause
}
func NewTransactionManager(txProcessor *TransactionProcessor) *TransactionManager {
txMgr := &TransactionManager{
txIdAlloc: newTxIDAllocator(),
txTree: NewBtree(),
txProcessor: txProcessor,
blacklist: util.NewSet(),
opLimiter: rate.NewLimiter(rate.Inf, 128),
}
return txMgr
}
func NewTransactionResource(txProcessor *TransactionProcessor) *TransactionResource {
txRsc := &TransactionResource{
txRbInodeTree: NewBtree(),
txRbDentryTree: NewBtree(),
txProcessor: txProcessor,
}
return txRsc
}
func NewTransactionProcessor(mp *metaPartition) *TransactionProcessor {
txProcessor := &TransactionProcessor{
mp: mp,
}
txProcessor.txManager = NewTransactionManager(txProcessor)
txProcessor.txResource = NewTransactionResource(txProcessor)
if mp.config != nil {
go txProcessor.txManager.processExpiredTransactions()
}
return txProcessor
}
func (tm *TransactionManager) setLimit(val int) string {
if val > 0 {
tm.opLimiter.SetLimit(rate.Limit(val))
return fmt.Sprintf("%v", val)
}
tm.opLimiter.SetLimit(rate.Inf)
return "unlimited"
}
func (tm *TransactionManager) Reset() {
tm.blacklist.Clear()
tm.Lock()
tm.txIdAlloc.Reset()
tm.txTree.Reset()
tm.opLimiter.SetLimit(0)
tm.Unlock()
}
var test = false
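// processExpiredTransactions is the background loop of the transaction manager: while
// this node is the leader it scans the transaction tree every few seconds, driving
// commit, rollback, deletion or orphan cleanup according to each transaction's state,
// and clears the address blacklist once per minute.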
func (tm *TransactionManager) processExpiredTransactions() {
mpId := tm.txProcessor.mp.config.PartitionId
log.LogInfof("processExpiredTransactions for mp[%v] started", mpId)
clearInterval := time.Second * 60
clearTimer := time.NewTimer(clearInterval)
txCheckVal := time.Second * 3
txCheckTimer := time.NewTimer(txCheckVal)
defer func() {
log.LogWarnf("processExpiredTransactions for mp[%v] exit", mpId)
txCheckTimer.Stop()
clearTimer.Stop()
return
}()
for {
select {
case <-tm.txProcessor.mp.stopC:
log.LogDebugf("[processExpiredTransactions] deleteWorker stop partition: %v", mpId)
return
default:
}
if _, ok := tm.txProcessor.mp.IsLeader(); !ok && !test {
log.LogDebugf("processExpiredTransactions: not leader sleep 1s, mp %d", mpId)
time.Sleep(time.Second * 10)
continue
}
select {
case <-tm.txProcessor.mp.stopC:
log.LogWarnf("processExpiredTransactions for mp[%v] stopped", mpId)
return
case <-clearTimer.C:
tm.blacklist.Clear()
clearTimer.Reset(clearInterval)
log.LogDebugf("processExpiredTransactions: blacklist cleared, mp %d", mpId)
case <-txCheckTimer.C:
if tm.txProcessor.Pause() {
txCheckTimer.Reset(txCheckVal)
continue
}
tm.processTx()
txCheckTimer.Reset(txCheckVal)
}
}
}
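// processTx walks the transaction tree once and dispatches expired or finished
// transactions to commit, rollback, delete or orphan-clearing goroutines, with at most
// 32 of them in flight at a time.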
func (tm *TransactionManager) processTx() {
mpId := tm.txProcessor.mp.config.PartitionId
start := time.Now()
log.LogDebugf("processTx: mp[%v] mask %v", mpId, proto.GetMaskString(tm.txProcessor.mask))
defer func() {
log.LogDebugf("processTx: mp %d total cost %s", mpId, time.Since(start).String())
}()
limitCh := make(chan struct{}, 32)
var wg sync.WaitGroup
get := func() {
wg.Add(1)
limitCh <- struct{}{}
}
put := func() {
<-limitCh
wg.Done()
}
idx := 0
f := func(i BtreeItem) bool {
idx++
if idx%100 == 0 {
if _, ok := tm.txProcessor.mp.IsLeader(); !ok {
log.LogWarnf("processExpiredTransactions for mp[%v] already not leader and break tx tree traverse",
tm.txProcessor.mp.config.PartitionId)
return false
}
}
tx := i.(*proto.TransactionInfo)
rollbackFunc := func(skipSetStat bool) {
defer put()
status, err := tm.rollbackTx(tx.TxID, skipSetStat)
if err != nil || status != proto.OpOk {
log.LogWarnf("processExpiredTransactions: transaction (%v) expired, rolling back failed, status(%v), err(%v)",
tx, status, err)
return
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) expired, rolling back done", tx)
}
}
commitFunc := func() {
defer put()
status, err := tm.commitTx(tx.TxID, true)
if err != nil || status != proto.OpOk {
log.LogWarnf("processExpiredTransactions: transaction (%v) expired, commit failed, status(%v), err(%v)",
tx, status, err)
return
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) expired, commit done", tx)
}
}
delFunc := func() {
defer put()
status, err := tm.delTxFromRM(tx.TxID)
if err != nil || status != proto.OpOk {
log.LogWarnf("processExpiredTransactions: delTxFromRM (%v) expired, commit failed, status(%v), err(%v)",
tx, status, err)
return
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) delTxFromRM, commit done", tx)
}
}
clearOrphan := func() {
defer put()
tm.clearOrphanTx(tx)
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) clearOrphanTx", tx)
}
}
if tx.TmID != int64(mpId) {
if tx.CanDelete() {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) can be deleted", tx)
}
get()
go delFunc()
return true
}
if tx.NeedClearOrphan() {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: orphan transaction (%v) can be clear", tx)
}
get()
go clearOrphan()
return true
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: RM transaction (%v) is ongoing", tx)
}
return true
}
if tx.State == proto.TxStateCommit {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) continue to commit...", tx)
}
get()
go commitFunc()
return true
}
if tx.State == proto.TxStateRollback {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) continue to roll back...", tx)
}
get()
go rollbackFunc(true)
return true
}
if tx.State == proto.TxStatePreCommit {
if !tx.IsExpired() {
return true
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) expired, rolling back...", tx)
}
get()
go rollbackFunc(false)
return true
}
if tx.IsDone() {
if !tx.CanDelete() {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) is ongoing", tx)
}
return true
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) can be deleted", tx)
}
get()
go delFunc()
return true
}
log.LogCriticalf("processExpiredTransactions: transaction (%v) is in state failed", tx)
return true
}
tm.txTree.GetTree().Ascend(f)
wg.Wait()
}
func (tm *TransactionManager) nextTxID() string {
id := tm.txIdAlloc.allocateTransactionID()
txId := fmt.Sprintf("%d_%d", tm.txProcessor.mp.config.PartitionId, id)
log.LogDebugf("nextTxID: txId:%v", txId)
return txId
}
func (tm *TransactionManager) txInRMDone(txId string) bool {
ifo := tm.getTransaction(txId)
if ifo == nil || ifo.Finish() {
log.LogWarnf("txInRMDone: tx in rm already done, txId %s, ifo %v", txId, ifo)
return true
}
return false
}
func (tm *TransactionManager) getTransaction(txID string) (txInfo *proto.TransactionInfo) {
txItem := proto.NewTxInfoBItem(txID)
item := tm.txTree.Get(txItem)
if item == nil {
return nil
}
txInfo = item.(*proto.TransactionInfo)
return
}
func (tm *TransactionManager) copyGetTx(txId string) (txInfo *proto.TransactionInfo) {
txItem := proto.NewTxInfoBItem(txId)
item := tm.txTree.CopyGet(txItem)
if item == nil {
return nil
}
txInfo = item.(*proto.TransactionInfo)
return
}
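// updateTxIdCursor parses a transaction ID of the form "<partitionID>_<seq>" and
// advances the local ID allocator if the sequence is larger than the current cursor.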
func (tm *TransactionManager) updateTxIdCursor(txId string) (err error) {
arr := strings.Split(txId, "_")
if len(arr) != 2 {
return fmt.Errorf("updateTxId: tx[%v] is invalid", txId)
}
id, err := strconv.ParseUint(arr[1], 10, 64)
if err != nil {
return fmt.Errorf("updateTxId: tx[%v] is invalid", txId)
}
if id > tm.txIdAlloc.getTransactionID() {
tm.txIdAlloc.setTransactionID(id)
}
return nil
}
func (tm *TransactionManager) addTxInfo(txInfo *proto.TransactionInfo) {
tm.txTree.ReplaceOrInsert(txInfo, true)
}
// registerTransaction is invoked on the TM to register a transaction initiated by a client request.
func (tm *TransactionManager) registerTransaction(txInfo *proto.TransactionInfo) (err error) {
if uint64(txInfo.TmID) == tm.txProcessor.mp.config.PartitionId {
if err := tm.updateTxIdCursor(txInfo.TxID); err != nil {
log.LogErrorf("updateTxIdCursor failed, txInfo %s, err %s", txInfo.String(), err.Error())
return err
}
for _, inode := range txInfo.TxInodeInfos {
inode.SetCreateTime(txInfo.CreateTime)
inode.SetTimeout(txInfo.Timeout)
inode.SetTxId(txInfo.TxID)
}
for _, dentry := range txInfo.TxDentryInfos {
dentry.SetCreateTime(txInfo.CreateTime)
dentry.SetTimeout(txInfo.Timeout)
dentry.SetTxId(txInfo.TxID)
}
}
if info := tm.getTransaction(txInfo.TxID); info != nil {
log.LogWarnf("tx is already exist, txId %s, info %v", txInfo.TxID, info.String())
return nil
}
tm.addTxInfo(txInfo)
if log.EnableDebug() {
log.LogDebugf("registerTransaction: txInfo(%v)", txInfo)
}
return
}
func (tm *TransactionManager) deleteTxInfo(txId string) (status uint8) {
tm.Lock()
defer tm.Unlock()
status = proto.OpOk
txItem := proto.NewTxInfoBItem(txId)
item := tm.txTree.Delete(txItem)
if log.EnableDebug() {
log.LogDebugf("deleteTxInfo: tx[%v] is deleted, item %v", txId, item)
}
return
}
func (tm *TransactionManager) rollbackTxInfo(txId string) (status uint8) {
tm.Lock()
defer tm.Unlock()
status = proto.OpOk
tx := tm.getTransaction(txId)
if tx == nil {
status = proto.OpTxInfoNotExistErr
log.LogWarnf("rollbackTxInfo: rollback tx[%v] failed, not found", txId)
return
}
tx.State = proto.TxStateRollbackDone
tx.DoneTime = time.Now().Unix()
log.LogDebugf("rollbackTxInfo: tx[%v] is rolled back", tx)
return
}
func (tm *TransactionManager) commitTxInfo(txId string) (status uint8, err error) {
tm.Lock()
defer tm.Unlock()
status = proto.OpOk
tx := tm.getTransaction(txId)
if tx == nil {
status = proto.OpTxInfoNotExistErr
err = fmt.Errorf("commitTxInfo: commit tx[%v] failed, not found", txId)
return
}
tx.State = proto.TxStateCommitDone
tx.DoneTime = time.Now().Unix()
log.LogDebugf("commitTxInfo: tx[%v] is committed", tx)
return
}
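// buildTxPacket builds a meta packet with the given opcode and partition ID and
// marshals data into its body.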
func buildTxPacket(data interface{}, mp uint64, op uint8) (pkt *proto.Packet, err error) {
pkt = proto.NewPacketReqID()
pkt.Opcode = op
pkt.PartitionID = mp
err = pkt.MarshalData(data)
if err != nil {
errInfo := fmt.Sprintf("buildTxPacket: marshal txInfo [%v] failed", data)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return nil, err
}
return
}
func (tm *TransactionManager) setTransactionState(txId string, state int32) (status uint8, err error) {
var val []byte
var resp interface{}
status = proto.OpOk
stateReq := &proto.TxSetStateRequest{
TxID: txId,
State: state,
}
val, _ = json.Marshal(stateReq)
resp, err = tm.txProcessor.mp.submit(opFSMTxSetState, val)
if err != nil {
log.LogWarnf("setTransactionState: set transaction[%v] state to [%v] failed, err[%v]", txId, state, err)
return proto.OpAgain, err
}
status = resp.(uint8)
if status != proto.OpOk {
errInfo := fmt.Sprintf("setTransactionState: set transaction[%v] state to [%v] failed", txId, state)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
}
return
}
func (tm *TransactionManager) delTxFromRM(txId string) (status uint8, err error) {
req := proto.TxApplyRequest{
TxID: txId,
}
val, err := json.Marshal(req)
if err != nil {
return
}
resp, err := tm.txProcessor.mp.submit(opFSMTxDelete, val)
if err != nil {
log.LogWarnf("delTxFromRM: delTxFromRM transaction[%v] failed, err[%v]", txId, err)
return proto.OpAgain, err
}
status = resp.(uint8)
if log.EnableDebug() {
log.LogDebugf("delTxFromRM: tx[%v] is deleted successfully, status (%s)", txId, proto.GetStatusStr(status))
}
return
}
func (tm *TransactionManager) clearOrphanTx(tx *proto.TransactionInfo) {
log.LogWarnf("clearOrphanTx: start to clearOrphanTx, tx %v", tx)
// check txInfo whether exist in tm
req := &proto.TxGetInfoRequest{
Pid: uint64(tx.TmID),
TxID: tx.TxID,
}
pkt, err := buildTxPacket(req, req.Pid, proto.OpMetaTxGet)
if err != nil {
return
}
mps := tx.GroupByMp()
tmpMp, ok := mps[req.Pid]
if !ok {
log.LogErrorf("clearOrphanTx: can't get tm Mp info from tx, tx %v", tx)
return
}
status := tm.txSendToMpWithAddrs(tmpMp.Members, pkt)
if status != proto.OpTxInfoNotExistErr {
log.LogWarnf("clearOrphanTx: tx is still exist, tx %v, status %s", tx, proto.GetStatusStr(status))
return
}
log.LogWarnf("clearOrphanTx: find tx in tm already not exist, start clear it from rm, tx %v", tx)
aReq := &proto.TxApplyRMRequest{
PartitionID: req.Pid,
TransactionInfo: tx,
}
newPkt := &Packet{}
err = tm.txProcessor.mp.TxRollbackRM(aReq, newPkt)
log.LogWarnf("clearOrphanTx: finally rollback tx in rm, tx %v, status %s, err %v",
tx, newPkt.GetResultMsg(), err)
return
}
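// commitTx drives the commit on the TM side: it moves the transaction to TxStateCommit,
// notifies all involved RMs, and finally applies the commit through raft.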
func (tm *TransactionManager) commitTx(txId string, skipSetStat bool) (status uint8, err error) {
tx := tm.getTransaction(txId)
if tx == nil {
status = proto.OpTxInfoNotExistErr
log.LogWarnf("commitTx: tx[%v] not found, already success", txId)
return
}
if tx.State == proto.TxStateCommitDone {
status = proto.OpOk
log.LogWarnf("commitTx: tx[%v] is already commit", txId)
return
}
// 1.set transaction to TxStateCommit
if !skipSetStat && tx.State != proto.TxStateCommit {
status, err = tm.setTransactionState(txId, proto.TxStateCommit)
if status != proto.OpOk {
log.LogWarnf("commitTx: set transaction[%v] state to TxStateCommit failed", tx)
return
}
}
// 2. notify all related RMs that a transaction is completed
status = tm.sendToRM(tx, proto.OpTxCommitRM)
if status != proto.OpOk {
return
}
// 3. TM commit the transaction
req := proto.TxApplyRequest{
TxID: txId,
}
val, err := json.Marshal(req)
if err != nil {
return
}
resp, err := tm.txProcessor.mp.submit(opFSMTxCommit, val)
if err != nil {
log.LogWarnf("commitTx: commit transaction[%v] failed, err[%v]", txId, err)
return proto.OpAgain, err
}
status = resp.(uint8)
log.LogDebugf("commitTx: tx[%v] is commited successfully", txId)
return
}
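// sendToRM notifies every meta partition involved in the transaction (including the
// local one) with the given commit or rollback opcode and aggregates the returned
// statuses, tolerating "already done" results.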
func (tm *TransactionManager) sendToRM(txInfo *proto.TransactionInfo, op uint8) (status uint8) {
status = proto.OpOk
mpIfos := txInfo.GroupByMp()
statusCh := make(chan uint8, len(mpIfos))
wg := sync.WaitGroup{}
mp := tm.txProcessor.mp
for mpId, ifo := range mpIfos {
req := &proto.TxApplyRMRequest{
VolName: mp.config.VolName,
PartitionID: mpId,
TransactionInfo: txInfo,
}
wg.Add(1)
pkt, _ := buildTxPacket(req, mpId, op)
if mp.config.PartitionId == mpId {
pt := &Packet{*pkt}
go func() {
defer wg.Done()
var err error
if op == proto.OpTxCommitRM {
err = mp.TxCommitRM(req, pt)
} else {
err = mp.TxRollbackRM(req, pt)
}
statusCh <- pt.ResultCode
if pt.ResultCode != proto.OpOk {
log.LogWarnf("sendToRM: invoke TxCommitRM failed, ifo %v, pkt %s, err %v", txInfo, pt.GetResultMsg(), err)
}
}()
continue
}
members := ifo.Members
go func() {
defer wg.Done()
status := tm.txSendToMpWithAddrs(members, pkt)
if status != proto.OpOk {
log.LogWarnf("sendToRM: send to rm failed, addr %s, pkt %s, status %s",
members, string(pkt.Data), proto.GetStatusStr(status))
}
statusCh <- status
}()
}
wg.Wait()
close(statusCh)
updateStatus := func(st uint8) uint8 {
if st == proto.OpTxConflictErr || st == proto.OpTxInfoNotExistErr {
log.LogWarnf("sendToRM: might have already been committed, tx[%v], status (%s)", txInfo, proto.GetStatusStr(st))
return proto.OpOk
} else if st == proto.OpTxRbInodeNotExistErr || st == proto.OpTxRbDentryNotExistErr {
log.LogWarnf("sendToRM: already done before or not add, tx[%v], status (%s)", txInfo, proto.GetStatusStr(st))
return proto.OpOk
} else {
return st
}
}
for st := range statusCh {
t := updateStatus(st)
if t != proto.OpOk {
return t
}
}
return status
}
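// rollbackTx mirrors commitTx for the rollback path: it moves the transaction to
// TxStateRollback, notifies all involved RMs, and applies the rollback through raft.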
func (tm *TransactionManager) rollbackTx(txId string, skipSetStat bool) (status uint8, err error) {
status = proto.OpOk
tx := tm.getTransaction(txId)
if tx == nil {
log.LogWarnf("commitTx: tx[%v] not found, already success", txId)
return
}
if tx.State == proto.TxStateRollbackDone {
status = proto.OpOk
log.LogWarnf("commitTx: tx[%v] is already rollback", txId)
return
}
// 1.set transaction to TxStateRollback
if !skipSetStat && tx.State != proto.TxStateRollback {
status, err = tm.setTransactionState(txId, proto.TxStateRollback)
if status != proto.OpOk {
log.LogWarnf("commitTransaction: set transaction[%v] state to TxStateCommit failed", tx)
return
}
}
// 2. notify all related RMs that a transaction is completed
status = tm.sendToRM(tx, proto.OpTxRollbackRM)
if status != proto.OpOk {
return
}
req := proto.TxApplyRequest{
TxID: txId,
}
val, err := json.Marshal(req)
if err != nil {
return
}
resp, err := tm.txProcessor.mp.submit(opFSMTxRollback, val)
if err != nil {
log.LogWarnf("commitTx: rollback transaction[%v] failed, err[%v]", txId, err)
return proto.OpAgain, err
}
status = resp.(uint8)
log.LogDebugf("commitTx: tx[%v] is rollback successfully, msg %s", txId, proto.GetStatusStr(status))
return
}
func (tm *TransactionManager) sendPacketToMP(addr string, p *proto.Packet) (err error) {
var (
mConn *net.TCPConn
reqID = p.ReqID
reqOp = p.Opcode
)
connPool := tm.txProcessor.mp.manager.connPool
defer func() {
connPool.PutConnect(mConn, err != nil)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
log.LogErrorf("[sendPacketToMP]: req: %d - %v, %v, packet(%v)", p.GetReqID(),
p.GetOpMsg(), err, p)
return
}
}()
mConn, err = connPool.GetConnect(addr)
if err != nil {
return
}
if err = p.WriteToConn(mConn); err != nil {
return
}
// read the response from the remote meta partition
if err = p.ReadFromConn(mConn, proto.ReadDeadlineTime); err != nil {
return
}
if reqID != p.ReqID || reqOp != p.Opcode {
err = fmt.Errorf("sendPacketToMP: send and received packet mismatch: req(%v_%v) resp(%v_%v)",
reqID, reqOp, p.ReqID, p.Opcode)
return
}
if log.EnableDebug() {
log.LogDebugf("[sendPacketToMP] req: %d - %v, resp: %v, packet(%v)", p.GetReqID(), p.GetOpMsg(),
p.GetResultMsg(), p)
}
return
}
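// txSendToMpWithAddrs tries the comma-separated member addresses one by one, skipping
// blacklisted addresses first and falling back to them afterwards; it returns the first
// definitive status, or OpAgain if every attempt failed.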
func (tm *TransactionManager) txSendToMpWithAddrs(addrStr string, p *proto.Packet) (status uint8) {
addrs := strings.Split(addrStr, ",")
var err error
skippedAddrs := make([]string, 0)
for _, addr := range addrs {
if tm.blacklist.Has(addr) {
log.LogWarnf("txSendToMpWithAddrs: addr[%v] is already blacklisted, retry another addr, p %s", addr, string(p.Data))
skippedAddrs = append(skippedAddrs, addr)
continue
}
newPkt := p.GetCopy()
err = tm.sendPacketToMP(addr, newPkt)
if err != nil {
tm.blacklist.Add(addr)
log.LogWarnf("txSendToMpWithAddrs: send to %v failed, err(%s), add to blacklist and retry another addr, p %s",
addr, err.Error(), string(p.Data))
continue
}
status := newPkt.ResultCode
if status == proto.OpErr || status == proto.OpAgain {
log.LogWarnf("txSendToMpWithAddrs: sendPacketToMp failed, addr %s, msg %s, data %s, status(%s)",
addr, newPkt.GetResultMsg(), string(p.Data), proto.GetStatusStr(status))
continue
}
if status == proto.OpOk {
if log.EnableDebug() {
log.LogDebugf("txSendToMpWithAddrs: send to %v done with status[%v], tx[%s]",
addr, status, string(p.Data))
}
err = nil
return status
}
log.LogWarnf("txSendToMpWithAddrs: sendPacketToMp failed, addr %s, msg %s, data %s, status %s",
addr, newPkt.GetResultMsg(), string(p.Data), proto.GetStatusStr(status))
return status
}
// try use skipped addr
for _, addr := range skippedAddrs {
newPkt := p.GetCopy()
err = tm.sendPacketToMP(addr, newPkt)
if err != nil {
log.LogWarnf("txSendToMpWithAddrs: send to %v failed, err(%s), add to blacklist and retry another addr, p %s",
addr, err.Error(), string(p.Data))
continue
}
status := newPkt.ResultCode
if status == proto.OpErr || status == proto.OpAgain {
log.LogWarnf("txSendToMpWithAddrs: sendPacketToMp failed, addr %s, msg %s, data %s, status(%s)",
addr, newPkt.GetResultMsg(), string(p.Data), proto.GetStatusStr(status))
continue
}
if status == proto.OpOk {
if log.EnableDebug() {
log.LogDebugf("txSendToMpWithAddrs: send to %v done with status[%v], tx[%s]",
addr, status, string(p.Data))
}
err = nil
return status
}
log.LogWarnf("txSendToMpWithAddrs: sendPacketToMp failed, addr %s, msg %s, data %s, status %s",
addr, newPkt.GetResultMsg(), string(p.Data), proto.GetStatusStr(status))
return status
}
log.LogWarnf("txSendToMpWithAddrs: after retry still failed, return opAgain, pkt %s, addrs %v, err %v, status %s",
string(p.Data), addrs, err, proto.GetStatusStr(status))
return proto.OpAgain
}
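// txSetState validates and applies a transaction state change requested by the
// TM: the target state must lie within [TxStateCommit, TxStateFailed], and a
// transaction may only be moved to commit or rollback from the pre-commit state
// (or when it is already in the requested state).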
func (tm *TransactionManager) txSetState(req *proto.TxSetStateRequest) (status uint8, err error) {
tm.Lock()
defer tm.Unlock()
status = proto.OpOk
txItem := proto.NewTxInfoBItem(req.TxID)
item := tm.txTree.CopyGet(txItem)
if item == nil {
status = proto.OpTxInfoNotExistErr
errInfo := fmt.Sprintf("txSetState: set state failed, req[%v] tx not existed", req)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
txInfo := item.(*proto.TransactionInfo)
if req.State == proto.TxStateCommit && txInfo.State == proto.TxStateCommitDone {
log.LogWarnf("txSetState: tx is already success before set commit state, tx %v", txInfo)
status = proto.OpOk
return
}
if req.State < proto.TxStateCommit || req.State > proto.TxStateFailed {
status = proto.OpTxSetStateErr
errInfo := fmt.Sprintf("txSetState: set state failed, wrong state, req[%v]", req)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
if req.State == proto.TxStateCommit && txInfo.State != proto.TxStateCommit && txInfo.State != proto.TxStatePreCommit {
status = proto.OpTxSetStateErr
errInfo := fmt.Sprintf("txSetState: set state failed, wrong state, tx state[%v], req state[%v], tx[%v]",
txInfo.State, req.State, req.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
if req.State == proto.TxStateRollback && txInfo.State != proto.TxStateRollback && txInfo.State != proto.TxStatePreCommit {
status = proto.OpTxSetStateErr
errInfo := fmt.Sprintf("txSetState: set state failed, wrong state, tx state[%v], req state[%v], tx[%v]",
txInfo.State, req.State, req.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
log.LogDebugf("txSetState: set tx state from [%v] to [%v], tx[%v]", txInfo.State, req.State, req.TxID)
txInfo.State = req.State
return
}
func (tr *TransactionResource) Reset() {
tr.Lock()
defer tr.Unlock()
tr.txRbInodeTree.Reset()
tr.txRbDentryTree.Reset()
tr.txProcessor = nil
}
// check whether the item (inode or dentry) is being modified by an ongoing transaction
func (tr *TransactionResource) isInodeInTransction(ino *Inode) (inTx bool, txID string) {
// return true only if the specified inode is in an ongoing transaction (not yet expired)
tr.Lock()
defer tr.Unlock()
if rbInode := tr.getTxRbInode(ino.Inode); rbInode != nil {
inTx = true
if rbInode.txInodeInfo != nil {
txID = rbInode.txInodeInfo.TxID
}
return
}
return false, ""
}
func (tr *TransactionResource) isDentryInTransction(dentry *Dentry) (inTx bool, txID string) {
tr.Lock()
defer tr.Unlock()
if rbDentry := tr.getTxRbDentry(dentry.ParentId, dentry.Name); rbDentry != nil {
inTx = true
if rbDentry.txDentryInfo != nil {
txID = rbDentry.txDentryInfo.TxID
}
return
}
return false, ""
}
func (tr *TransactionResource) getTxRbInode(ino uint64) (rbInode *TxRollbackInode) {
keyNode := &TxRollbackInode{
inode: NewInode(ino, 0),
}
item := tr.txRbInodeTree.Get(keyNode)
if item == nil {
return nil
}
rbInode = item.(*TxRollbackInode)
return
}
func (tr *TransactionResource) copyGetTxRbInode(ino uint64) (rbInode *TxRollbackInode) {
keyNode := &TxRollbackInode{
inode: NewInode(ino, 0),
}
item := tr.txRbInodeTree.CopyGet(keyNode)
if item == nil {
return nil
}
rbInode = item.(*TxRollbackInode)
return
}
func (tr *TransactionResource) deleteTxRollbackInode(ino uint64, txId string) (status uint8) {
tr.Lock()
defer tr.Unlock()
keyNode := &TxRollbackInode{
txInodeInfo: proto.NewTxInodeInfo("", ino, 0),
}
item := tr.txRbInodeTree.Get(keyNode)
if item == nil {
log.LogWarnf("deleteTxRollbackInode: rollback inode may be already been deleted, inode %d, txId %s",
ino, txId)
return proto.OpTxRbInodeNotExistErr
}
if item.(*TxRollbackInode).txInodeInfo.TxID != txId {
log.LogWarnf("deleteTxRollbackInode: rollback inode has already been updated by another transaction, txId %s, item %v",
txId, item)
return proto.OpTxRbInodeNotExistErr
}
tr.txRbInodeTree.Delete(item)
return proto.OpOk
}
// RM adds a `TxRollbackInode` into `txRollbackInodes`
func (tr *TransactionResource) addTxRollbackInode(rbInode *TxRollbackInode) (status uint8) {
tr.Lock()
defer tr.Unlock()
oldRbInode := tr.getTxRbInode(rbInode.inode.Inode)
if oldRbInode != nil {
if oldRbInode.txInodeInfo.TxID == rbInode.txInodeInfo.TxID {
log.LogWarnf("addTxRollbackInode: rollback inode [ino(%v) txID(%v)] is already exists",
rbInode.inode.Inode, rbInode.txInodeInfo.TxID)
return proto.OpExistErr
} else {
log.LogErrorf("addTxRollbackInode: rollback inode [ino(%v) txID(%v)] "+
"is conflicted with inode [ino(%v) txID(%v)]",
rbInode.inode.Inode, rbInode.txInodeInfo.TxID, oldRbInode.inode.Inode, oldRbInode.txInodeInfo.TxID)
return proto.OpTxConflictErr
}
}
tr.txRbInodeTree.ReplaceOrInsert(rbInode, true)
log.LogDebugf("addTxRollbackInode: rollback inode [ino(%v) txID(%v)] is added", rbInode.inode.Inode, rbInode.txInodeInfo.TxID)
return proto.OpOk
}
func (tr *TransactionResource) getTxRbDentry(pId uint64, name string) *TxRollbackDentry {
keyNode := &TxRollbackDentry{
txDentryInfo: proto.NewTxDentryInfo("", pId, name, 0),
}
item := tr.txRbDentryTree.Get(keyNode)
if item == nil {
return nil
}
return item.(*TxRollbackDentry)
}
func (tr *TransactionResource) deleteTxRollbackDentry(pid uint64, name, txId string) (status uint8) {
tr.Lock()
defer tr.Unlock()
keyNode := &TxRollbackDentry{
txDentryInfo: proto.NewTxDentryInfo("", pid, name, 0),
}
item := tr.txRbDentryTree.Get(keyNode)
if item == nil {
log.LogWarnf("deleteTxRollbackDentry: rollback dentry may be already been deleted, pid %d, name %s, txId %s",
pid, name, txId)
return proto.OpTxRbDentryNotExistErr
}
if item.(*TxRollbackDentry).txDentryInfo.TxID != txId {
log.LogWarnf("deleteTxRollbackDentry: rollback dentry is already been update by other, txId %s, item %v",
txId, name)
return proto.OpTxRbDentryNotExistErr
}
tr.txRbDentryTree.Delete(item)
return proto.OpOk
}
// RM adds a `TxRollbackDentry` into `txRollbackDentries`
func (tr *TransactionResource) addTxRollbackDentry(rbDentry *TxRollbackDentry) (status uint8) {
tr.Lock()
defer tr.Unlock()
oldRbDentry := tr.getTxRbDentry(rbDentry.txDentryInfo.ParentId, rbDentry.dentry.Name)
if oldRbDentry != nil {
if oldRbDentry.txDentryInfo.TxID == rbDentry.txDentryInfo.TxID {
log.LogWarnf("addTxRollbackDentry: rollback dentry [pino(%v) name(%v) txID(%v)] is already exists",
rbDentry.dentry.ParentId, rbDentry.dentry.Name, rbDentry.txDentryInfo.TxID)
return proto.OpExistErr
}
log.LogWarnf("addTxRollbackDentry: rollback dentry [pino(%v) name(%v) txID(%v) rbType(%v)] "+
"is conflicted with dentry [pino(%v) name(%v) txID(%v) rbType(%v)]",
rbDentry.dentry.ParentId, rbDentry.dentry.Name, rbDentry.txDentryInfo.TxID, rbDentry.rbType,
oldRbDentry.dentry.ParentId, oldRbDentry.dentry.Name, oldRbDentry.txDentryInfo.TxID, oldRbDentry.rbType)
return proto.OpTxConflictErr
}
tr.txRbDentryTree.ReplaceOrInsert(rbDentry, true)
log.LogDebugf("addTxRollbackDentry: rollback dentry [pino(%v) name(%v) txID(%v) rbType(%v)] is added",
rbDentry.dentry.ParentId, rbDentry.dentry.Name, rbDentry.txDentryInfo.TxID, rbDentry.rbType)
return proto.OpOk
}
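// rollbackInodeInternal applies the rollback action recorded in rbInode: with
// rbType TxAdd the saved inode is put back into the inode tree (or its link
// count is bumped when a live copy still exists), with rbType TxDelete the
// inode is unlinked and evicted again. The rollback record is then removed
// from txRbInodeTree.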
func (tr *TransactionResource) rollbackInodeInternal(rbInode *TxRollbackInode) (status uint8, err error) {
status = proto.OpOk
mp := tr.txProcessor.mp
switch rbInode.rbType {
case TxAdd:
var ino *Inode
item := mp.inodeTree.CopyGet(rbInode.inode)
if item != nil {
ino = item.(*Inode)
}
if item == nil || ino.IsTempFile() || ino.ShouldDelete() {
mp.freeList.Remove(rbInode.inode.Inode)
if mp.uidManager != nil {
mp.uidManager.addUidSpace(rbInode.inode.Uid, rbInode.inode.Inode, rbInode.inode.Extents.eks)
}
if mp.mqMgr != nil && len(rbInode.quotaIds) > 0 && item == nil {
mp.setInodeQuota(rbInode.quotaIds, rbInode.inode.Inode)
for _, quotaId := range rbInode.quotaIds {
mp.mqMgr.updateUsedInfo(int64(rbInode.inode.Size), 1, quotaId)
}
}
mp.inodeTree.ReplaceOrInsert(rbInode.inode, true)
} else {
ino.IncNLink(mp.verSeq)
}
case TxDelete:
if rsp := tr.txProcessor.mp.getInode(rbInode.inode, false); rsp.Status == proto.OpOk {
if tr.txProcessor.mp.uidManager != nil {
tr.txProcessor.mp.uidManager.doMinusUidSpace(rbInode.inode.Uid, rbInode.inode.Inode, rbInode.inode.Size)
}
if tr.txProcessor.mp.mqMgr != nil && len(rbInode.quotaIds) > 0 {
for _, quotaId := range rbInode.quotaIds {
tr.txProcessor.mp.mqMgr.updateUsedInfo(-1*int64(rbInode.inode.Size), -1, quotaId)
}
}
tr.txProcessor.mp.fsmUnlinkInode(rbInode.inode, 0)
tr.txProcessor.mp.fsmEvictInode(rbInode.inode)
}
default:
status = proto.OpTxRollbackUnknownRbType
err = fmt.Errorf("rollbackInode: unknown rbType %d", rbInode.rbType)
return
}
tr.txRbInodeTree.Delete(rbInode)
return
}
// RM rolls back an inode; retried if an error occurs
func (tr *TransactionResource) rollbackInode(req *proto.TxInodeApplyRequest) (status uint8, err error) {
tr.Lock()
defer tr.Unlock()
status = proto.OpOk
rbInode := tr.getTxRbInode(req.Inode)
if rbInode == nil {
status = proto.OpTxRbInodeNotExistErr
errInfo := fmt.Sprintf("rollbackInode: roll back inode[%v] failed, txID[%v], rb inode not found", req.Inode, req.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
if rbInode.txInodeInfo.TxID != req.TxID {
status = proto.OpTxConflictErr
errInfo := fmt.Sprintf("rollbackInode: txID %v is not matching txInodeInfo txID %v", req.TxID, rbInode.txInodeInfo.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
status, err = tr.rollbackInodeInternal(rbInode)
if err != nil {
log.LogErrorf("rollbackInode: inode[%v] roll back failed in tx[%v], rbType[%v]", req.Inode, req.TxID, rbInode.rbType)
} else {
log.LogDebugf("rollbackInode: inode[%v] is rolled back in tx[%v], rbType[%v]", req.Inode, req.TxID, rbInode.rbType)
}
return
}
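// rollbackDentryInternal applies the rollback action recorded in rbDentry:
// rbType TxAdd re-creates the dentry, TxDelete removes it, and TxUpdate restores
// the previous dentry. The rollback record is then removed from txRbDentryTree.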
func (tr *TransactionResource) rollbackDentryInternal(rbDentry *TxRollbackDentry) (status uint8, err error) {
defer func() {
if status != proto.OpOk {
log.LogErrorf("rollbackDentryInternal: rollback dentry failed, ifo %v", rbDentry.txDentryInfo)
}
}()
status = proto.OpOk
switch rbDentry.rbType {
case TxAdd:
// the flag must be true so that the link count does not change.
status = tr.txProcessor.mp.fsmCreateDentry(rbDentry.dentry, true)
case TxDelete:
resp := tr.txProcessor.mp.fsmDeleteDentry(rbDentry.dentry, true)
status = resp.Status
case TxUpdate:
resp := tr.txProcessor.mp.fsmUpdateDentry(rbDentry.dentry)
status = resp.Status
default:
status = proto.OpTxRollbackUnknownRbType
err = fmt.Errorf("rollbackDentry: unknown rbType %d", rbDentry.rbType)
return
}
tr.txRbDentryTree.Delete(rbDentry)
return
}
// RM rolls back a dentry; retried if an error occurs
func (tr *TransactionResource) rollbackDentry(req *proto.TxDentryApplyRequest) (status uint8, err error) {
tr.Lock()
defer tr.Unlock()
status = proto.OpOk
rbDentry := tr.getTxRbDentry(req.Pid, req.Name)
if rbDentry == nil {
status = proto.OpTxRbDentryNotExistErr
errInfo := fmt.Sprintf("rollbackDentry: roll back dentry[%v_%v] failed, rb inode not found, txID[%v]",
req.Pid, req.Name, req.TxID)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
if rbDentry.txDentryInfo.TxID != req.TxID {
status = proto.OpTxConflictErr
errInfo := fmt.Sprintf("rollbackDentry: txID %v is not matching txInodeInfo txID %v", req.TxID, rbDentry.txDentryInfo.TxID)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
status, err = tr.rollbackDentryInternal(rbDentry)
if err != nil {
log.LogErrorf("rollbackDentry: denKey[%v] roll back failed in tx[%v], rbType[%v]",
rbDentry.txDentryInfo.GetKey(), req.TxID, rbDentry.rbType)
} else {
log.LogDebugf("rollbackDentry: denKey[%v] is rolled back in tx[%v], rbType[%v]",
rbDentry.txDentryInfo.GetKey(), req.TxID, rbDentry.rbType)
}
return
}
// RM simply removes the inode from the TransactionResource
func (tr *TransactionResource) commitInode(txID string, inode uint64) (status uint8, err error) {
tr.Lock()
defer tr.Unlock()
status = proto.OpOk
rbInode := tr.getTxRbInode(inode)
if rbInode == nil {
status = proto.OpTxRbInodeNotExistErr
errInfo := fmt.Sprintf("commitInode: commit inode[%v] failed, rb inode not found", inode)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
if rbInode.txInodeInfo.TxID != txID {
status = proto.OpTxConflictErr
errInfo := fmt.Sprintf("commitInode: txID %v is not matching txInodeInfo txID %v", txID, rbInode.txInodeInfo.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
tr.txRbInodeTree.Delete(rbInode)
log.LogDebugf("commitInode: inode[%v] is committed", inode)
return
}
// RM simply removes the dentry from the TransactionResource
func (tr *TransactionResource) commitDentry(txID string, pId uint64, name string) (status uint8, err error) {
tr.Lock()
defer tr.Unlock()
status = proto.OpOk
rbDentry := tr.getTxRbDentry(pId, name)
if rbDentry == nil {
status = proto.OpTxRbDentryNotExistErr
errInfo := fmt.Sprintf("commitDentry: commit dentry[%v_%v] failed, rb dentry not found", pId, name)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
if rbDentry.txDentryInfo.TxID != txID {
status = proto.OpTxConflictErr
errInfo := fmt.Sprintf("commitDentry: txID %v is not matching txDentryInfo txID %v", txID, rbDentry.txDentryInfo.TxID)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
tr.txRbDentryTree.Delete(rbDentry)
// unlink parent inode
if rbDentry.rbType == TxAdd {
parInode := NewInode(pId, 0)
st := tr.txProcessor.mp.fsmUnlinkInode(parInode, 0)
if st.Status != proto.OpOk {
log.LogWarnf("commitDentry: try unlink parent inode failed, txId %s, inode[%v]", txID, parInode)
return
}
}
log.LogDebugf("commitDentry: dentry[%v] is committed", rbDentry.txDentryInfo.GetKey())
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"sync"
"sync/atomic"
)
// TxIDAllocator generates and allocates transaction IDs
type TxIDAllocator struct {
mpTxID uint64
txIDLock sync.RWMutex
}
// func newTxIDAllocator(mpID uint64, partition raftstore.Partition) (alloc *TxIDAllocator) {
func newTxIDAllocator() (alloc *TxIDAllocator) {
alloc = new(TxIDAllocator)
return
}
func (alloc *TxIDAllocator) Reset() {
atomic.StoreUint64(&alloc.mpTxID, 0)
}
func (alloc *TxIDAllocator) setTransactionID(id uint64) {
atomic.StoreUint64(&alloc.mpTxID, id)
}
func (alloc *TxIDAllocator) getTransactionID() uint64 {
return atomic.LoadUint64(&alloc.mpTxID)
}
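// allocateTransactionID returns the next transaction ID. The mutex serializes
// concurrent allocations so the load-increment-store sequence stays atomic,
// while plain reads and restores go through the atomic helpers above.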
func (alloc *TxIDAllocator) allocateTransactionID() (mpTxID uint64) {
alloc.txIDLock.Lock()
defer alloc.txIDLock.Unlock()
mpTxID = atomic.LoadUint64(&alloc.mpTxID) + 1
alloc.setTransactionID(mpTxID)
return
}
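// Illustrative usage sketch (hypothetical caller, not part of this package's API):
//
//	alloc := newTxIDAllocator()
//	alloc.setTransactionID(lastPersistedTxID) // restore the counter after a restart
//	next := alloc.allocateTransactionID()     // monotonically increasing per allocator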
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"hash/crc32"
"sync"
"time"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
const (
checkerVersionSize = 4
CrcUint32Size = 4
checkerVersion = 1
checkerRecordV1Len = 16
opKeepTime = 300
opKeepOps = 1024
opRebuildSec = 86400
opCheckerInterval = time.Second * 10
opCheckerSliceCap = 1024
)
type uniqOp struct {
uniqid uint64
atime int64
}
type uniqChecker struct {
sync.Mutex
op map[uint64]struct{}
inQue *uniqOpQueue
rtime int64
keepTime int64
keepOps int
}
func newUniqChecker() *uniqChecker {
return &uniqChecker{
op: make(map[uint64]struct{}),
inQue: newUniqOpQueue(),
keepTime: opKeepTime,
keepOps: opKeepOps,
rtime: timeutil.GetCurrentTimeUnix(),
}
}
func (checker *uniqChecker) clone() *uniqChecker {
checker.Lock()
inQue := checker.inQue.clone()
checker.Unlock()
return &uniqChecker{inQue: inQue}
}
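// Marshal encodes the checker as a 4-byte big-endian version followed by one
// 16-byte record (uniqid, atime) per queued op, and returns the buffer together
// with its CRC32 (IEEE) checksum.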
func (checker *uniqChecker) Marshal() (buf []byte, crc uint32, err error) {
buffer := bytes.NewBuffer(make([]byte, 0, checkerVersionSize+checker.inQue.len()*checkerRecordV1Len))
if err = binary.Write(buffer, binary.BigEndian, int32(checkerVersion)); err != nil {
return
}
checker.inQue.scan(func(op *uniqOp) bool {
if err = binary.Write(buffer, binary.BigEndian, op.uniqid); err != nil {
return false
}
if err = binary.Write(buffer, binary.BigEndian, op.atime); err != nil {
return false
}
return true
})
sign := crc32.NewIEEE()
if _, err = sign.Write(buffer.Bytes()); err != nil {
return
}
crc = sign.Sum32()
buf = buffer.Bytes()
return
}
func (checker *uniqChecker) UnMarshal(data []byte) (err error) {
if len(data) < checkerVersionSize {
err = errors.New("invalid uniqChecker file length")
log.LogErrorf("uniqChecker UnMarshal err(%v)", err)
return
}
buff := bytes.NewBuffer(data)
var version int32
if err = binary.Read(buff, binary.BigEndian, &version); err != nil {
log.LogErrorf("uniqChecker unmarshal read version err(%v)", err)
return
}
var uniqid uint64
var atime int64
now := time.Now().Unix()
for buff.Len() != 0 {
if err = binary.Read(buff, binary.BigEndian, &uniqid); err != nil {
log.LogErrorf("uniqChecker unmarshal read uniqid err(%v)", err)
return
}
if err = binary.Read(buff, binary.BigEndian, &atime); err != nil {
log.LogErrorf("uniqChecker unmarshal read atime err(%v)", err)
return
}
// skip records whose atime is too far ahead of the local time
if atime > now+86400 {
log.LogWarnf("uniqChecker skip invalid atime %v uniqid %v", atime, uniqid)
continue
}
checker.inQue.append(&uniqOp{uniqid, atime})
checker.op[uniqid] = struct{}{}
}
return
}
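// legalIn reports whether bid has not been seen before. A zero uniqid always
// passes; any other id is recorded on first sight and rejected on repeats, which
// lets replayed requests be detected.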
func (checker *uniqChecker) legalIn(bid uint64) bool {
// ignore zero uniqid
if bid == 0 {
return true
}
checker.Lock()
defer checker.Unlock()
if _, ok := checker.op[bid]; ok {
return false
} else {
checker.op[bid] = struct{}{}
checker.inQue.append(&uniqOp{bid, time.Now().Unix()})
}
return true
}
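// evictIndex scans the queue for expired ops. An op older than keepTime is
// evictable, but once evicting more would leave no more than keepOps entries the
// threshold is relaxed to 10*keepTime so recent entries are kept longer. It
// returns the number of remaining entries, the index of the last evictable op,
// and that op; the lock is briefly released every 10000 hits to avoid starving
// other callers.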
func (checker *uniqChecker) evictIndex() (left int, idx int, op *uniqOp) {
checker.Lock()
defer checker.Unlock()
inQueCnt := checker.inQue.len()
if inQueCnt <= checker.keepOps {
return inQueCnt, -1, nil
}
var c int
var lastOp *uniqOp
nowtime := time.Now().Unix()
checker.inQue.scan(func(op *uniqOp) bool {
kt := checker.keepTime
if inQueCnt-c <= checker.keepOps {
kt = 10 * checker.keepTime
}
if nowtime-op.atime >= kt {
lastOp = op
c++
if c%10000 == 0 {
checker.Unlock()
time.Sleep(100 * time.Microsecond)
checker.Lock()
}
return true
}
return false
})
return inQueCnt - c, c - 1, lastOp
}
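// doEvict removes every op up to and including evictBid from both the map and
// the queue, and at most once per opRebuildSec rebuilds the map from the queue
// to release memory held by deleted keys.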
func (checker *uniqChecker) doEvict(evictBid uint64) {
checker.Lock()
defer checker.Unlock()
cnt := 0
// evict from map
if _, ok := checker.op[evictBid]; ok {
checker.inQue.scan(func(op *uniqOp) bool {
cnt++
delete(checker.op, op.uniqid)
if op.uniqid == evictBid {
return false
}
return true
})
}
if cnt == 0 {
return
}
// truncate from queue
checker.inQue.truncate(cnt - 1)
// periodically rebuild the map to reduce memory usage
n := timeutil.GetCurrentTimeUnix()
if n-checker.rtime > opRebuildSec {
checker.op = make(map[uint64]struct{}, checker.inQue.len())
checker.inQue.scan(func(op *uniqOp) bool {
checker.op[op.uniqid] = struct{}{}
return true
})
checker.rtime = n
}
}
type uniqOpSlice struct {
s []*uniqOp
}
// uniqOpQueue is an append-only queue; items in the queue must not be modified
type uniqOpQueue struct {
cnt int
ss []*uniqOpSlice
cur *uniqOpSlice
}
func newUniqOpQueue() *uniqOpQueue {
s := &uniqOpSlice{s: make([]*uniqOp, 0, opCheckerSliceCap)}
return &uniqOpQueue{
cnt: 0,
ss: []*uniqOpSlice{s},
cur: s,
}
}
func (b *uniqOpQueue) append(v *uniqOp) {
if cap(b.cur.s)-len(b.cur.s) == 0 {
b.cur = &uniqOpSlice{s: make([]*uniqOp, 0, opCheckerSliceCap)}
b.ss = append(b.ss, b.cur)
}
b.cur.s = append(b.cur.s, v)
b.cnt++
}
func (b *uniqOpQueue) index(idx int) *uniqOp {
for _, s := range b.ss {
l := len(s.s)
if idx >= l {
idx = idx - l
} else {
return s.s[idx]
}
}
return nil
}
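// truncate drops the first idx+1 ops from the queue; if that covers every
// queued op the queue is simply reset.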
func (b *uniqOpQueue) truncate(idx int) {
if idx >= b.cnt-1 {
b.reset()
return
}
b.cnt = b.cnt - idx - 1
var tidx int
var s *uniqOpSlice
for tidx, s = range b.ss {
l := len(s.s)
if idx >= l {
idx = idx - l
} else {
b.ss[tidx].s = s.s[idx+1:]
break
}
}
b.ss = b.ss[tidx:]
}
func (b *uniqOpQueue) scan(fn func(op *uniqOp) bool) {
for _, s := range b.ss {
for _, op := range s.s {
if !fn(op) {
return
}
}
}
}
func (b *uniqOpQueue) len() int {
return b.cnt
}
func (b *uniqOpQueue) reset() {
b.cur = &uniqOpSlice{s: make([]*uniqOp, 0, opCheckerSliceCap)}
b.ss = []*uniqOpSlice{b.cur}
b.cnt = 0
}
func (b *uniqOpQueue) clone() *uniqOpQueue {
ss := make([]*uniqOpSlice, 0, len(b.ss))
for _, s := range b.ss {
ss = append(ss, &uniqOpSlice{s.s[:]})
}
return &uniqOpQueue{
cnt: b.cnt,
ss: ss,
cur: ss[len(ss)-1],
}
}
package metanode
import (
"fmt"
"os"
"sort"
"strconv"
"strings"
)
type DelExtFile []os.FileInfo
func (del DelExtFile) Len() int {
return len(del)
}
func (del DelExtFile) Swap(i, j int) {
del[i], del[j] = del[j], del[i]
}
func (del DelExtFile) Less(i, j int) bool {
idx1 := getDelExtFileIdx(del[i].Name())
idx2 := getDelExtFileIdx(del[j].Name())
return idx1 < idx2
}
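// getDelExtFileIdx extracts the numeric index after the last '_' of a deleted-
// extent file name (expected to look like <prefixDelExtent>_<idx>) and panics
// when the name does not carry a parseable index.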
func getDelExtFileIdx(name string) int64 {
arr := strings.Split(name, "_")
size := len(arr)
if size < 2 {
panic(fmt.Errorf("file name is not legal, %s", name))
}
idx, err := strconv.ParseInt(arr[size-1], 10, 64)
if err != nil {
panic(fmt.Errorf("file name is not legal, %s", name))
}
return idx
}
func sortDelExtFileInfo(files []os.FileInfo) []os.FileInfo {
newFiles := make([]os.FileInfo, 0)
for _, info := range files {
if info.IsDir() {
continue
}
if strings.HasPrefix(info.Name(), prefixDelExtent) {
newFiles = append(newFiles, info)
}
}
if len(newFiles) <= 1 {
return newFiles
}
sort.Sort(DelExtFile(newFiles))
return newFiles
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"encoding/json"
"fmt"
"strconv"
"time"
"github.com/cubefs/cubefs/util"
)
// api
const (
// Admin APIs
AdminGetMasterApiList = "/admin/getMasterApiList"
AdminSetApiQpsLimit = "/admin/setApiQpsLimit"
AdminGetApiQpsLimit = "/admin/getApiQpsLimit"
AdminRemoveApiQpsLimit = "/admin/rmApiQpsLimit"
AdminGetCluster = "/admin/getCluster"
AdminSetClusterInfo = "/admin/setClusterInfo"
AdminGetMonitorPushAddr = "/admin/getMonitorPushAddr"
AdminGetDataPartition = "/dataPartition/get"
AdminLoadDataPartition = "/dataPartition/load"
AdminCreateDataPartition = "/dataPartition/create"
AdminCreatePreLoadDataPartition = "/dataPartition/createPreLoad"
AdminDecommissionDataPartition = "/dataPartition/decommission"
AdminDiagnoseDataPartition = "/dataPartition/diagnose"
AdminResetDataPartitionDecommissionStatus = "/dataPartition/resetDecommissionStatus"
AdminQueryDataPartitionDecommissionStatus = "/dataPartition/queryDecommissionStatus"
AdminDeleteDataReplica = "/dataReplica/delete"
AdminAddDataReplica = "/dataReplica/add"
AdminDeleteVol = "/vol/delete"
AdminUpdateVol = "/vol/update"
AdminVolShrink = "/vol/shrink"
AdminVolExpand = "/vol/expand"
AdminVolForbidden = "/vol/forbidden"
AdminVolEnableAuditLog = "/vol/auditlog"
AdminCreateVol = "/admin/createVol"
AdminGetVol = "/admin/getVol"
AdminClusterFreeze = "/cluster/freeze"
AdminClusterForbidMpDecommission = "/cluster/forbidMetaPartitionDecommission"
AdminClusterStat = "/cluster/stat"
AdminSetCheckDataReplicasEnable = "/cluster/setCheckDataReplicasEnable"
AdminGetIP = "/admin/getIp"
AdminCreateMetaPartition = "/metaPartition/create"
AdminSetMetaNodeThreshold = "/threshold/set"
AdminListVols = "/vol/list"
AdminSetNodeInfo = "/admin/setNodeInfo"
AdminGetNodeInfo = "/admin/getNodeInfo"
AdminGetAllNodeSetGrpInfo = "/admin/getDomainInfo"
AdminGetNodeSetGrpInfo = "/admin/getDomainNodeSetGrpInfo"
AdminGetIsDomainOn = "/admin/getIsDomainOn"
AdminUpdateNodeSetCapcity = "/admin/updateNodeSetCapcity"
AdminUpdateNodeSetId = "/admin/updateNodeSetId"
AdminUpdateNodeSetNodeSelector = "/admin/updateNodeSetNodeSelector"
AdminUpdateDomainDataUseRatio = "/admin/updateDomainDataRatio"
AdminUpdateZoneExcludeRatio = "/admin/updateZoneExcludeRatio"
AdminSetNodeRdOnly = "/admin/setNodeRdOnly"
AdminSetDpRdOnly = "/admin/setDpRdOnly"
AdminSetConfig = "/admin/setConfig"
AdminGetConfig = "/admin/getConfig"
AdminDataPartitionChangeLeader = "/dataPartition/changeleader"
AdminChangeMasterLeader = "/master/changeleader"
AdminOpFollowerPartitionsRead = "/master/opFollowerPartitionRead"
AdminUpdateDecommissionLimit = "/admin/updateDecommissionLimit"
AdminQueryDecommissionLimit = "/admin/queryDecommissionLimit"
// #nosec G101
AdminQueryDecommissionToken = "/admin/queryDecommissionToken"
AdminSetFileStats = "/admin/setFileStatsEnable"
AdminGetFileStats = "/admin/getFileStatsEnable"
AdminGetClusterValue = "/admin/getClusterValue"
AdminSetClusterUuidEnable = "/admin/setClusterUuidEnable"
AdminGetClusterUuid = "/admin/getClusterUuid"
AdminGenerateClusterUuid = "/admin/generateClusterUuid"
AdminSetDpDiscard = "/admin/setDpDiscard"
AdminGetDiscardDp = "/admin/getDiscardDp"
AdminSetConLcNodeNum = "/admin/setConLcNodeNum"
AdminGetAllLcNodeInfo = "/admin/getAllLcNodeInfo"
AdminLcNode = "/admin/lcnode"
AdminUpdateDecommissionDiskFactor = "/admin/updateDecommissionDiskFactor"
AdminQueryDecommissionDiskLimit = "/admin/queryDecommissionDiskLimit"
AdminEnableAutoDecommissionDisk = "/admin/enableAutoDecommissionDisk"
AdminQueryAutoDecommissionDisk = "/admin/queryAutoDecommissionDisk"
// graphql master api
AdminClusterAPI = "/api/cluster"
AdminUserAPI = "/api/user"
AdminVolumeAPI = "/api/volume"
// graphql console api
ConsoleIQL = "/iql"
ConsoleLoginAPI = "/login"
ConsoleMonitorAPI = "/cfs_monitor"
ConsoleFile = "/file"
ConsoleFileDown = "/file/down"
ConsoleFileUpload = "/file/upload"
// Client APIs
ClientDataPartitions = "/client/partitions"
ClientVol = "/client/vol"
ClientMetaPartition = "/metaPartition/get"
ClientVolStat = "/client/volStat"
ClientMetaPartitions = "/client/metaPartitions"
// qos api
QosGetStatus = "/qos/getStatus"
QosGetClientsLimitInfo = "/qos/getClientsInfo"
QosGetZoneLimitInfo = "/qos/getZoneLimit" // include disk enable
QosUpdate = "/qos/update" // include disk enable
QosUpdateMagnify = "/qos/updateMagnify"
QosUpdateClientParam = "/qos/updateClientParam"
QosUpdateZoneLimit = "/qos/updateZoneLimit" // include disk enable
QosUpload = "/admin/qosUpload"
QosUpdateMasterLimit = "/qos/masterLimit"
// acl api
AdminACL = "/admin/aclOp"
// uid api
AdminUid = "/admin/uidOp"
// raft node APIs
AddRaftNode = "/raftNode/add"
RemoveRaftNode = "/raftNode/remove"
RaftStatus = "/get/raftStatus"
// node APIs
AddDataNode = "/dataNode/add"
DecommissionDataNode = "/dataNode/decommission"
QueryDataNodeDecoProgress = "/dataNode/queryDecommissionProgress"
QueryDataNodeDecoFailedDps = "/dataNode/queryDecommissionFailedDps"
MigrateDataNode = "/dataNode/migrate"
CancelDecommissionDataNode = "/dataNode/cancelDecommission"
DecommissionDisk = "/disk/decommission"
RecommissionDisk = "/disk/recommission"
QueryDiskDecoProgress = "/disk/queryDecommissionProgress"
MarkDecoDiskFixed = "/disk/MarkDecommissionDiskFixed"
CancelDecommissionDisk = "/disk/cancelDecommission"
QueryDecommissionDiskDecoFailedDps = "/disk/queryDecommissionFailedDps"
QueryBadDisks = "/disk/queryBadDisks"
RestoreStoppedAutoDecommissionDisk = "/disk/restoreStoppedAutoDecommissionDisk"
QueryAllDecommissionDisk = "/disk/queryAllDecommissionDisk"
GetDataNode = "/dataNode/get"
AddMetaNode = "/metaNode/add"
DecommissionMetaNode = "/metaNode/decommission"
MigrateMetaNode = "/metaNode/migrate"
GetMetaNode = "/metaNode/get"
AdminUpdateMetaNode = "/metaNode/update"
AdminUpdateDataNode = "/dataNode/update"
AdminGetInvalidNodes = "/invalid/nodes"
AdminLoadMetaPartition = "/metaPartition/load"
AdminDiagnoseMetaPartition = "/metaPartition/diagnose"
AdminDecommissionMetaPartition = "/metaPartition/decommission"
AdminChangeMetaPartitionLeader = "/metaPartition/changeleader"
AdminBalanceMetaPartitionLeader = "/metaPartition/balanceLeader"
AdminAddMetaReplica = "/metaReplica/add"
AdminDeleteMetaReplica = "/metaReplica/delete"
AdminPutDataPartitions = "/dataPartitions/set"
// admin multi version snapshot
AdminCreateVersion = "/multiVer/create"
AdminDelVersion = "/multiVer/del"
AdminGetVersionInfo = "/multiVer/get"
AdminGetAllVersionInfo = "/multiVer/getAll"
AdminGetVolVer = "/vol/getVer"
AdminSetVerStrategy = "/vol/SetVerStrategy"
// S3 lifecycle configuration APIS
SetBucketLifecycle = "/s3/setLifecycle"
GetBucketLifecycle = "/s3/getLifecycle"
DeleteBucketLifecycle = "/s3/deleteLifecycle"
AddLcNode = "/lcNode/add"
QueryDisableDisk = "/dataNode/queryDisableDisk"
// Operation response
GetMetaNodeTaskResponse = "/metaNode/response" // Method: 'POST', ContentType: 'application/json'
GetDataNodeTaskResponse = "/dataNode/response" // Method: 'POST', ContentType: 'application/json'
GetLcNodeTaskResponse = "/lcNode/response" // Method: 'POST', ContentType: 'application/json'
GetTopologyView = "/topo/get"
UpdateZone = "/zone/update"
GetAllZones = "/zone/list"
GetAllNodeSets = "/nodeSet/list"
GetNodeSet = "/nodeSet/get"
UpdateNodeSet = "/nodeSet/update"
// Header keys
SkipOwnerValidation = "Skip-Owner-Validation"
ForceDelete = "Force-Delete"
// APIs for user management
UserCreate = "/user/create"
UserDelete = "/user/delete"
UserUpdate = "/user/update"
UserUpdatePolicy = "/user/updatePolicy"
UserRemovePolicy = "/user/removePolicy"
UserDeleteVolPolicy = "/user/deleteVolPolicy"
UserGetInfo = "/user/info"
UserGetAKInfo = "/user/akInfo"
UserTransferVol = "/user/transferVol"
UserList = "/user/list"
UsersOfVol = "/vol/users"
// graphql api for header
HeadAuthorized = "Authorization"
ParamAuthorized = "_authorization"
UserKey = "_user_key"
UserInfoKey = "_user_info_key"
// quota
QuotaCreate = "/quota/create"
QuotaUpdate = "/quota/update"
QuotaDelete = "/quota/delete"
QuotaList = "/quota/list"
QuotaGet = "/quota/get"
// QuotaBatchModifyPath = "/quota/batchModifyPath"
QuotaListAll = "/quota/listAll"
// s3 qos api
S3QoSSet = "/s3/qos/set"
S3QoSGet = "/s3/qos/get"
S3QoSDelete = "/s3/qos/delete"
)
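// GApiInfo maps lower-cased API names to their request paths; the entries that
// are commented out below are not exposed through this table.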
var GApiInfo map[string]string = map[string]string{
"admingetmasterapilist": AdminGetMasterApiList,
"adminsetapiqpslimit": AdminSetApiQpsLimit,
"admingetcluster": AdminGetCluster,
"adminsetclusterinfo": AdminSetClusterInfo,
"admingetdatapartition": AdminGetDataPartition,
"adminloaddatapartition": AdminLoadDataPartition,
"admincreatedatapartition": AdminCreateDataPartition,
"admincreatepreloaddatapartition": AdminCreatePreLoadDataPartition,
"admindecommissiondatapartition": AdminDecommissionDataPartition,
"admindiagnosedatapartition": AdminDiagnoseDataPartition,
"admindeletedatareplica": AdminDeleteDataReplica,
"adminadddatareplica": AdminAddDataReplica,
"admindeletevol": AdminDeleteVol,
"adminupdatevol": AdminUpdateVol,
"adminvolshrink": AdminVolShrink,
"adminvolexpand": AdminVolExpand,
"admincreatevol": AdminCreateVol,
"admingetvol": AdminGetVol,
"adminclusterfreeze": AdminClusterFreeze,
"adminclusterforbidmpdecommission": AdminClusterForbidMpDecommission,
"adminclusterstat": AdminClusterStat,
"admingetip": AdminGetIP,
"admincreatemetapartition": AdminCreateMetaPartition,
"adminsetmetanodethreshold": AdminSetMetaNodeThreshold,
"adminlistvols": AdminListVols,
"adminsetnodeinfo": AdminSetNodeInfo,
"admingetnodeinfo": AdminGetNodeInfo,
"admingetallnodesetgrpinfo": AdminGetAllNodeSetGrpInfo,
"admingetnodesetgrpinfo": AdminGetNodeSetGrpInfo,
"admingetisdomainon": AdminGetIsDomainOn,
"adminupdatenodesetcapcity": AdminUpdateNodeSetCapcity,
"adminupdatenodesetid": AdminUpdateNodeSetId,
"adminupdatedomaindatauseratio": AdminUpdateDomainDataUseRatio,
"adminupdatezoneexcluderatio": AdminUpdateZoneExcludeRatio,
"adminsetnoderdonly": AdminSetNodeRdOnly,
"adminsetdprdonly": AdminSetDpRdOnly,
"admindatapartitionchangeleader": AdminDataPartitionChangeLeader,
"adminsetdpdiscard": AdminSetDpDiscard,
"admingetdiscarddp": AdminGetDiscardDp,
//"adminclusterapi": AdminClusterAPI,
//"adminuserapi": AdminUserAPI,
//"adminvolumeapi": AdminVolumeAPI,
//"consoleiql": ConsoleIQL,
//"consoleloginapi": ConsoleLoginAPI,
//"consolemonitorapi": ConsoleMonitorAPI,
//"consolefile": ConsoleFile,
//"consolefiledown": ConsoleFileDown,
//"consolefileupload": ConsoleFileUpload,
"clientdatapartitions": ClientDataPartitions,
"clientvol": ClientVol,
"clientmetapartition": ClientMetaPartition,
"clientvolstat": ClientVolStat,
"clientmetapartitions": ClientMetaPartitions,
"qosgetstatus": QosGetStatus,
"qosgetclientslimitinfo": QosGetClientsLimitInfo,
"qosgetzonelimitinfo": QosGetZoneLimitInfo,
"qosupdate": QosUpdate,
//"qosupdatemagnify": QosUpdateMagnify,
"qosupdateclientparam": QosUpdateClientParam,
"qosupdatezonelimit": QosUpdateZoneLimit,
"qosupload": QosUpload,
"qosupdatemasterlimit": QosUpdateMasterLimit,
"addraftnode": AddRaftNode,
"removeraftnode": RemoveRaftNode,
"raftstatus": RaftStatus,
"adddatanode": AddDataNode,
"decommissiondatanode": DecommissionDataNode,
"migratedatanode": MigrateDataNode,
"canceldecommissiondatanode": CancelDecommissionDataNode,
"decommissiondisk": DecommissionDisk,
"getdatanode": GetDataNode,
"addmetanode": AddMetaNode,
"decommissionmetanode": DecommissionMetaNode,
"migratemetanode": MigrateMetaNode,
"getmetanode": GetMetaNode,
"adminupdatemetanode": AdminUpdateMetaNode,
"adminupdatedatanode": AdminUpdateDataNode,
"admingetinvalidnodes": AdminGetInvalidNodes,
"adminloadmetapartition": AdminLoadMetaPartition,
"admindiagnosemetapartition": AdminDiagnoseMetaPartition,
"admindecommissionmetapartition": AdminDecommissionMetaPartition,
"adminchangemetapartitionleader": AdminChangeMetaPartitionLeader,
"adminbalancemetapartitionleader": AdminBalanceMetaPartitionLeader,
"adminaddmetareplica": AdminAddMetaReplica,
"admindeletemetareplica": AdminDeleteMetaReplica,
"getmetanodetaskresponse": GetMetaNodeTaskResponse,
"getdatanodetaskresponse": GetDataNodeTaskResponse,
"gettopologyview": GetTopologyView,
"updatezone": UpdateZone,
"getallzones": GetAllZones,
"usercreate": UserCreate,
"userdelete": UserDelete,
"userupdate": UserUpdate,
"userupdatepolicy": UserUpdatePolicy,
"userremovepolicy": UserRemovePolicy,
"userdeletevolpolicy": UserDeleteVolPolicy,
"usergetinfo": UserGetInfo,
"usergetakinfo": UserGetAKInfo,
"usertransfervol": UserTransferVol,
"userlist": UserList,
"usersofvol": UsersOfVol,
}
const (
TimeFormat = "2006-01-02 15:04:05"
DefaultDirChildrenNumLimit = 20000000
MinDirChildrenNumLimit = 1000000
)
// HTTPReply uniform response structure
type HTTPReply struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data interface{} `json:"data"`
}
type HTTPReplyRaw struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data json.RawMessage `json:"data"`
}
func (raw *HTTPReplyRaw) Unmarshal(body []byte) error {
r := new(HTTPReplyRaw)
if err := json.Unmarshal(body, r); err != nil {
return fmt.Errorf("httpreply unmarshal [%s]", err.Error())
}
*raw = *r
return nil
}
func (raw *HTTPReplyRaw) Success() error {
if code := raw.Code; code != ErrCodeSuccess {
err := ParseErrorCode(code)
return fmt.Errorf("httpreply code[%d] err[%s] msg[%s]", code, err.Error(), raw.Msg)
}
return nil
}
func (raw *HTTPReplyRaw) Bytes() []byte {
return raw.Data
}
func (raw *HTTPReplyRaw) String() string {
return string(raw.Bytes())
}
func (raw *HTTPReplyRaw) Int64() (int64, error) {
return strconv.ParseInt(string(raw.Data), 10, 64)
}
func (raw *HTTPReplyRaw) Uint64() (uint64, error) {
return strconv.ParseUint(string(raw.Data), 10, 64)
}
func (raw *HTTPReplyRaw) Result(result interface{}) error {
return json.Unmarshal(raw.Data, result)
}
func UnmarshalHTTPReply(body []byte, result interface{}) error {
raw := new(HTTPReplyRaw)
if err := raw.Unmarshal(body); err != nil {
return err
}
if err := raw.Success(); err != nil {
return err
}
if result == nil {
return nil
}
switch v := result.(type) {
case *string:
*v = raw.String()
case *int64:
val, err := raw.Int64()
if err != nil {
return err
}
*v = val
case *uint64:
val, err := raw.Uint64()
if err != nil {
return err
}
*v = val
default:
return raw.Result(result)
}
return nil
}
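// Illustrative usage sketch (assumption: body holds a master HTTPReply whose
// Data field encodes a ClusterInfo; the variable names are hypothetical):
//
//	var info ClusterInfo
//	if err := UnmarshalHTTPReply(body, &info); err != nil {
//		return err
//	}
//	// Pass *string, *int64 or *uint64 instead to receive raw scalar replies.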
// RegisterMetaNodeResp defines the response to register a meta node.
type RegisterMetaNodeResp struct {
ID uint64
}
type AclIpInfo struct {
Ip string
CTime int64
}
type AclRsp struct {
Info string
OK bool
List []*AclIpInfo
Reserve string
}
type UidSpaceRsp struct {
Info string
OK bool
UidSpaceArr []*UidSpaceInfo
Reserve string
}
type VolumeVerStrategy struct {
KeepVerCnt int
Periodic int
Enable bool
ForceUpdate bool
UTime time.Time
}
func (v *VolumeVerStrategy) GetPeriodic() int {
return v.Periodic
}
func (v *VolumeVerStrategy) GetPeriodicSecond() int {
// return v.Periodic*24*3600
return v.Periodic * 3600
}
func (v *VolumeVerStrategy) TimeUp(curTime time.Time) bool {
return v.UTime.Add(time.Second * time.Duration(v.GetPeriodicSecond())).Before(curTime)
}
type VolumeVerInfo struct {
Name string
VerSeq uint64
VerSeqPrepare uint64
VerPrepareStatus uint8
Enabled bool
}
// ClusterInfo defines the cluster information.
type ClusterInfo struct {
Cluster string
Ip string
MetaNodeDeleteBatchCount uint64
MetaNodeDeleteWorkerSleepMs uint64
DataNodeDeleteLimitRate uint64
DataNodeAutoRepairLimitRate uint64
DpMaxRepairErrCnt uint64
DirChildrenNumLimit uint32
EbsAddr string
ServicePath string
ClusterUuid string
ClusterUuidEnable bool
}
// CreateDataPartitionRequest defines the request to create a data partition.
type CreateDataPartitionRequest struct {
PartitionTyp int
PartitionId uint64
PartitionSize int
ReplicaNum int
VolumeId string
IsRandomWrite bool
Members []Peer
Hosts []string
CreateType int
LeaderSize int
DecommissionedDisks []string
IsMultiVer bool
VerSeq uint64
}
// CreateDataPartitionResponse defines the response to the request of creating a data partition.
type CreateDataPartitionResponse struct {
PartitionId uint64
Status uint8
Result string
}
// DeleteDataPartitionRequest defines the request to delete a data partition.
type DeleteDataPartitionRequest struct {
DataPartitionType string
PartitionId uint64
PartitionSize int
}
// DeleteDataPartitionResponse defines the response to the request of deleting a data partition.
type DeleteDataPartitionResponse struct {
Status uint8
Result string
PartitionId uint64
}
// DataPartitionDecommissionRequest defines the request of decommissioning a data partition.
type DataPartitionDecommissionRequest struct {
PartitionId uint64
RemovePeer Peer
AddPeer Peer
}
// AddDataPartitionRaftMemberRequest defines the request to add a raft member to a data partition.
type AddDataPartitionRaftMemberRequest struct {
PartitionId uint64
AddPeer Peer
}
// RemoveDataPartitionRaftMemberRequest defines the request to remove a raft member from a data partition.
type RemoveDataPartitionRaftMemberRequest struct {
PartitionId uint64
RemovePeer Peer
Force bool
}
// AddMetaPartitionRaftMemberRequest defines the request to add a raft member to a meta partition.
type AddMetaPartitionRaftMemberRequest struct {
PartitionId uint64
AddPeer Peer
}
// RemoveMetaPartitionRaftMemberRequest defines the request to remove a raft member from a meta partition.
type RemoveMetaPartitionRaftMemberRequest struct {
PartitionId uint64
RemovePeer Peer
}
// LoadDataPartitionRequest defines the request of loading a data partition.
type LoadDataPartitionRequest struct {
PartitionId uint64
}
// LoadDataPartitionResponse defines the response to the request of loading a data partition.
type LoadDataPartitionResponse struct {
PartitionId uint64
Used uint64
PartitionSnapshot []*File
Status uint8
PartitionStatus int
Result string
VolName string
}
type StopDataPartitionRepairRequest struct {
PartitionId uint64
Stop bool
}
// StopDataPartitionRepairResponse defines the response to the request of stopping the repair of a data partition.
type StopDataPartitionRepairResponse struct {
Status uint8
Result string
PartitionId uint64
}
// File defines the file struct.
type File struct {
Name string
Crc uint32
Size uint32
Modified int64
ApplyID uint64
}
// LoadMetaPartitionMetricRequest defines the request of loading the meta partition metrics.
type LoadMetaPartitionMetricRequest struct {
PartitionID uint64
Start uint64
End uint64
}
// LoadMetaPartitionMetricResponse defines the response to the request of loading the meta partition metrics.
type LoadMetaPartitionMetricResponse struct {
Start uint64
End uint64
MaxInode uint64
Status uint8
Result string
}
type UidLimitToMetaNode struct {
UidLimitInfo []*UidSpaceInfo
}
type QosToDataNode struct {
EnableDiskQos bool
QosIopsReadLimit uint64
QosIopsWriteLimit uint64
QosFlowReadLimit uint64
QosFlowWriteLimit uint64
}
// MultiVersionOpRequest defines the request of a multi-version snapshot operation.
type MultiVersionOpRequest struct {
VolumeID string
VerSeq uint64
Op uint8
Addr string
VolVerList []*VolVersionInfo
}
// MultiVersionOpResponse defines the response to a multi-version snapshot operation request.
type MultiVersionOpResponse struct {
VolumeID string
Addr string
Op uint8
VerSeq uint64
Status uint8
Result string
}
type QuotaHeartBeatInfos struct {
QuotaHbInfos []*QuotaHeartBeatInfo
}
type TxInfo struct {
Volume string
Mask TxOpMask
OpLimitVal int
}
type TxInfos struct {
TxInfo []*TxInfo
}
// HeartBeatRequest defines the heartbeat request.
type HeartBeatRequest struct {
CurrTime int64
MasterAddr string
FLReadVols []string
QosToDataNode
FileStatsEnable bool
UidLimitToMetaNode
QuotaHeartBeatInfos
TxInfos
ForbiddenVols []string
DisableAuditVols []string
DecommissionDisks []string // NOTE: for datanode
}
// DataPartitionReport defines the partition report.
type DataPartitionReport struct {
VolName string
PartitionID uint64
PartitionStatus int
Total uint64
Used uint64
DiskPath string
IsLeader bool
ExtentCount int
NeedCompare bool
DecommissionRepairProgress float64
}
type DataNodeQosResponse struct {
IopsRLimit uint64
IopsWLimit uint64
FlowRlimit uint64
FlowWlimit uint64
Status uint8
Result string
}
type BadDiskStat struct {
DiskPath string
TotalPartitionCnt int
DiskErrPartitionList []uint64
}
// DataNodeHeartbeatResponse defines the response to the data node heartbeat.
type DataNodeHeartbeatResponse struct {
Total uint64
Used uint64
Available uint64
TotalPartitionSize uint64 // volCnt * volsize
RemainingCapacity uint64 // remaining capacity to create partition
CreatedPartitionCnt uint32
MaxCapacity uint64 // maximum capacity to create partition
StartTime int64
ZoneName string
PartitionReports []*DataPartitionReport
Status uint8
Result string
BadDisks []string // Keep this old field for compatibility
BadDiskStats []BadDiskStat // key: disk path
CpuUtil float64 `json:"cpuUtil"`
IoUtils map[string]float64 `json:"ioUtil"`
}
// MetaPartitionReport defines the meta partition report.
type MetaPartitionReport struct {
PartitionID uint64
Start uint64
End uint64
Status int
Size uint64
MaxInodeID uint64
IsLeader bool
VolName string
InodeCnt uint64
DentryCnt uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
FreeListLen uint64
UidInfo []*UidReportSpaceInfo
QuotaReportInfos []*QuotaReportInfo
}
// MetaNodeHeartbeatResponse defines the response to the meta node heartbeat request.
type MetaNodeHeartbeatResponse struct {
ZoneName string
Total uint64
MemUsed uint64
MetaPartitionReports []*MetaPartitionReport
Status uint8
Result string
CpuUtil float64 `json:"cpuUtil"`
}
// LcNodeHeartbeatResponse defines the response to the lc node heartbeat.
type LcNodeHeartbeatResponse struct {
Status uint8
Result string
LcTaskCountLimit int
LcScanningTasks map[string]*LcNodeRuleTaskResponse
SnapshotScanningTasks map[string]*SnapshotVerDelTaskResponse
}
// DeleteFileRequest defines the request to delete a file.
type DeleteFileRequest struct {
VolId uint64
Name string
}
// DeleteFileResponse defines the response to the request of deleting a file.
type DeleteFileResponse struct {
Status uint8
Result string
VolId uint64
Name string
}
// DeleteMetaPartitionRequest defines the request of deleting a meta partition.
type DeleteMetaPartitionRequest struct {
PartitionID uint64
}
// DeleteMetaPartitionResponse defines the response to the request of deleting a meta partition.
type DeleteMetaPartitionResponse struct {
PartitionID uint64
Status uint8
Result string
}
// UpdateMetaPartitionRequest defines the request to update a meta partition.
type UpdateMetaPartitionRequest struct {
PartitionID uint64
VolName string
Start uint64
End uint64
}
// UpdateMetaPartitionResponse defines the response to the request of updating the meta partition.
type UpdateMetaPartitionResponse struct {
PartitionID uint64
VolName string
End uint64
Status uint8
Result string
}
// MetaPartitionDecommissionRequest defines the request of decommissioning a meta partition.
type MetaPartitionDecommissionRequest struct {
PartitionID uint64
VolName string
RemovePeer Peer
AddPeer Peer
}
// MetaPartitionDecommissionResponse defines the response to the request of decommissioning a meta partition.
type MetaPartitionDecommissionResponse struct {
PartitionID uint64
VolName string
Status uint8
Result string
}
// MetaPartitionLoadRequest defines the request to load meta partition.
type MetaPartitionLoadRequest struct {
PartitionID uint64
}
// MetaPartitionLoadResponse defines the response to the request of loading meta partition.
type MetaPartitionLoadResponse struct {
PartitionID uint64
DoCompare bool
ApplyID uint64
CommittedID uint64
MaxInode uint64
DentryCount uint64
InodeCount uint64
Addr string
}
// DataPartitionResponse defines the response from a data node to the master that is related to a data partition.
type DataPartitionResponse struct {
PartitionType int
PartitionID uint64
Status int8
ReplicaNum uint8
Hosts []string
LeaderAddr string
Epoch uint64
IsRecover bool
PartitionTTL int64
IsDiscard bool
}
// DataPartitionsView defines the view of a data partition
type DataPartitionsView struct {
DataPartitions []*DataPartitionResponse
}
func NewDataPartitionsView() (dataPartitionsView *DataPartitionsView) {
dataPartitionsView = new(DataPartitionsView)
dataPartitionsView.DataPartitions = make([]*DataPartitionResponse, 0)
return
}
// MetaPartitionView defines the view of a meta partition
type MetaPartitionView struct {
PartitionID uint64
Start uint64
End uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
FreeListLen uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
IsRecover bool
Members []string
LeaderAddr string
Status int8
}
type DataNodeDisksRequest struct{}
type DataNodeDisksResponse struct{}
type OSSSecure struct {
AccessKey string
SecretKey string
}
// VolView defines the view of a volume
type VolView struct {
Name string
Owner string
Status uint8
FollowerRead bool
MetaPartitions []*MetaPartitionView
DataPartitions []*DataPartitionResponse
DomainOn bool
OSSSecure *OSSSecure
CreateTime int64
DeleteLockTime int64
CacheTTL int
VolType int
}
func (v *VolView) SetOwner(owner string) {
v.Owner = owner
}
func (v *VolView) SetOSSSecure(accessKey, secretKey string) {
v.OSSSecure = &OSSSecure{AccessKey: accessKey, SecretKey: secretKey}
}
func NewVolView(name string, status uint8, followerRead bool, createTime int64, cacheTTL int, volType int, deleteLockTime int64) (view *VolView) {
view = new(VolView)
view.Name = name
view.FollowerRead = followerRead
view.CreateTime = createTime
view.DeleteLockTime = deleteLockTime
view.Status = status
view.MetaPartitions = make([]*MetaPartitionView, 0)
view.DataPartitions = make([]*DataPartitionResponse, 0)
view.CacheTTL = cacheTTL
view.VolType = volType
return
}
func NewMetaPartitionView(partitionID, start, end uint64, status int8) (mpView *MetaPartitionView) {
mpView = new(MetaPartitionView)
mpView.PartitionID = partitionID
mpView.Start = start
mpView.End = end
mpView.Status = status
mpView.Members = make([]string, 0)
return
}
const (
QosStateNormal uint8 = 0x01
QosStateHitLimit uint8 = 0x02
MinIopsLimit uint64 = 3
MinFLowLimit uint64 = 128 * util.KB
)
const (
IopsReadType uint32 = 0x01
IopsWriteType uint32 = 0x02
FlowReadType uint32 = 0x03
FlowWriteType uint32 = 0x04
)
const (
QosDefaultBurst = 16000000
QosDefaultClientCnt uint32 = 100
QosDefaultDiskMaxFLowLimit int = 0x7FFFFFFF
QosDefaultDiskMaxIoLimit int = 100000
)
func QosTypeString(factorType uint32) string {
switch factorType {
case IopsReadType:
return "IopsRead"
case IopsWriteType:
return "IopsWrite"
case FlowReadType:
return "FlowRead"
case FlowWriteType:
return "FlowWrite"
default:
return "unkown"
}
}
type ClientLimitInfo struct {
UsedLimit uint64
UsedBuffer uint64
Used uint64
Need uint64
}
type ClientReportLimitInfo struct {
ID uint64
FactorMap map[uint32]*ClientLimitInfo
Host string
Status uint8
_ string // reserved
}
func NewClientReportLimitInfo() *ClientReportLimitInfo {
return &ClientReportLimitInfo{
FactorMap: make(map[uint32]*ClientLimitInfo),
}
}
type LimitRsp2Client struct {
ID uint64
Enable bool
ReqPeriod uint32
HitTriggerCnt uint8
FactorMap map[uint32]*ClientLimitInfo
Magnify map[uint32]uint32
_ string // reserved
}
func NewLimitRsp2Client() *LimitRsp2Client {
limit := &LimitRsp2Client{
FactorMap: make(map[uint32]*ClientLimitInfo),
Magnify: make(map[uint32]uint32),
}
return limit
}
type UidSimpleInfo struct {
UID uint32
Limited bool
}
// SimpleVolView defines the simple view of a volume
type SimpleVolView struct {
ID uint64
Name string
Owner string
ZoneName string
DpReplicaNum uint8
MpReplicaNum uint8
InodeCount uint64
DentryCount uint64
MaxMetaPartitionID uint64
Status uint8
Capacity uint64 // GB
RwDpCnt int
MpCnt int
DpCnt int
FollowerRead bool
NeedToLowerReplica bool
Authenticate bool
CrossZone bool
DefaultPriority bool
DomainOn bool
CreateTime string
DeleteLockTime int64
EnableToken bool
EnablePosixAcl bool
EnableQuota bool
EnableTransaction string
TxTimeout int64
TxConflictRetryNum int64
TxConflictRetryInterval int64
TxOpLimit int
Description string
DpSelectorName string
DpSelectorParm string
DefaultZonePrior bool
DpReadOnlyWhenVolFull bool
VolType int
ObjBlockSize int
CacheCapacity uint64
CacheAction int
CacheThreshold int
CacheHighWater int
CacheLowWater int
CacheLruInterval int
CacheTtl int
CacheRule string
PreloadCapacity uint64
Uids []UidSimpleInfo
// multi version snapshot
LatestVer uint64
Forbidden bool
EnableAuditLog bool
}
type NodeSetInfo struct {
ID uint64
ZoneName string
Capacity int
DataUseRatio float64
MetaUseRatio float64
MetaUsed uint64
MetaTotal uint64
MetaNodes []*MetaNodeInfo
DataUsed uint64
DataTotal uint64
DataNodes []*DataNodeInfo
}
type SimpleNodeSetGrpInfo struct {
ID uint64
Status uint8
NodeSetInfo []NodeSetInfo
}
type SimpleNodeSetGrpInfoList struct {
DomainId uint64
Status uint8
SimpleNodeSetGrpInfo []*SimpleNodeSetGrpInfo
}
type DomainNodeSetGrpInfoList struct {
DomainOn bool
DataRatioLimit float64
ZoneExcludeRatioLimit float64
NeedDomain bool
ExcludeZones []string
DomainNodeSetGrpInfo []*SimpleNodeSetGrpInfoList
}
// MasterAPIAccessResp defines the response for getting meta partition
type MasterAPIAccessResp struct {
APIResp APIAccessResp `json:"api_resp"`
Data []byte `json:"data"`
}
type VolInfo struct {
Name string
Owner string
CreateTime int64
Status uint8
TotalSize uint64
UsedSize uint64
DpReadOnlyWhenVolFull bool
}
func NewVolInfo(name, owner string, createTime int64, status uint8, totalSize, usedSize uint64, dpReadOnlyWhenVolFull bool) *VolInfo {
return &VolInfo{
Name: name,
Owner: owner,
CreateTime: createTime,
Status: status,
TotalSize: totalSize,
UsedSize: usedSize,
DpReadOnlyWhenVolFull: dpReadOnlyWhenVolFull,
}
}
// ZoneView defines the view of a zone
type ZoneView struct {
Name string
Status string
DataNodesetSelector string
MetaNodesetSelector string
NodeSet map[uint64]*NodeSetView
}
type NodeSetView struct {
DataNodeLen int
MetaNodeLen int
MetaNodes []NodeView
DataNodes []NodeView
}
// TopologyView provides the topology view of the cluster
type TopologyView struct {
Zones []*ZoneView
}
const (
PartitionTypeNormal = 0
PartitionTypeCache = 1
PartitionTypePreLoad = 2
)
func GetDpType(volType int, isPreload bool) int {
if volType == VolumeTypeHot {
return PartitionTypeNormal
}
if isPreload {
return PartitionTypePreLoad
}
return PartitionTypeCache
}
func IsCacheDp(typ int) bool {
return typ == PartitionTypeCache
}
func IsNormalDp(typ int) bool {
return typ == PartitionTypeNormal
}
func IsPreLoadDp(typ int) bool {
return typ == PartitionTypePreLoad
}
const (
VolumeTypeHot = 0
VolumeTypeCold = 1
)
func IsCold(typ int) bool {
return typ == VolumeTypeCold
}
func IsHot(typ int) bool {
return typ == VolumeTypeHot
}
const (
NoCache = 0
RCache = 1
RWCache = 2
)
const (
LFClient = 1 // low frequency client
)
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"time"
)
const (
TaskFailed = 2
TaskStart = 0
TaskSucceeds = 1
TaskRunning = 3
ResponseInterval = 5
ResponseTimeOut = 100
MaxSendCount = 5
)
// AdminTask defines the administration task.
type AdminTask struct {
ID string
PartitionID uint64
OpCode uint8
OperatorAddr string
Status int8
SendTime int64
CreateTime int64
SendCount uint8
Request interface{}
Response interface{}
}
// ToString returns the string format of the task.
func (t *AdminTask) ToString() (msg string) {
msg = fmt.Sprintf("ID[%v] OpCode[%d] Status[%d] LastSendTime[%v] SendCount[%v] Request[%v] Response[%v]",
t.ID, t.OpCode, t.Status, t.SendTime, t.SendCount, t.Request, t.Response)
return
}
func (t *AdminTask) IdString() string {
return fmt.Sprintf("id:%s_sendTime_%d_createTime_%d", t.ID, t.SendTime, t.CreateTime)
}
// CheckTaskNeedSend checks if the task needs to be sent out.
func (t *AdminTask) CheckTaskNeedSend() (needRetry bool) {
if (int)(t.SendCount) < MaxSendCount && time.Now().Unix()-t.SendTime > (int64)(ResponseInterval) {
needRetry = true
}
return
}
// CheckTaskTimeOut checks if the task is timed out.
func (t *AdminTask) CheckTaskTimeOut() (notResponse bool) {
if (int)(t.SendCount) >= MaxSendCount || (t.SendTime > 0 && (time.Now().Unix()-t.SendTime > int64(ResponseTimeOut))) {
notResponse = true
}
return
}
// SetStatus sets the status of the task.
func (t *AdminTask) SetStatus(status int8) {
t.Status = status
}
// IsTaskSuccessful returns whether the task has been executed successfully.
func (t *AdminTask) IsTaskSuccessful() (isSuccess bool) {
if t.Status == TaskSucceeds {
isSuccess = true
}
return
}
// IsTaskFailed returns if the task failed.
func (t *AdminTask) IsTaskFailed() (isFail bool) {
if t.Status == TaskFailed {
isFail = true
}
return
}
// IsUrgentTask returns if the task is urgent.
func (t *AdminTask) IsUrgentTask() bool {
return t.isCreateTask() || t.isLoadTask() || t.isUpdateMetaPartitionTask()
}
// isUpdateMetaPartitionTask checks if the task is to update the meta partition.
func (t *AdminTask) isUpdateMetaPartitionTask() bool {
return t.OpCode == OpUpdateMetaPartition
}
func (t *AdminTask) isLoadTask() bool {
return t.OpCode == OpLoadDataPartition
}
func (t *AdminTask) isCreateTask() bool {
return t.OpCode == OpCreateDataPartition || t.OpCode == OpCreateMetaPartition
}
// IsHeartbeatTask returns if the task is a heartbeat task.
func (t *AdminTask) IsHeartbeatTask() bool {
return t.OpCode == OpDataNodeHeartbeat || t.OpCode == OpMetaNodeHeartbeat || t.OpCode == OpLcNodeHeartbeat
}
// NewAdminTask returns a new adminTask.
func NewAdminTask(opCode uint8, opAddr string, request interface{}) (t *AdminTask) {
t = new(AdminTask)
t.OpCode = opCode
t.Request = request
t.OperatorAddr = opAddr
t.ID = fmt.Sprintf("addr[%v]_op[%v]", t.OperatorAddr, t.OpCode)
t.CreateTime = time.Now().Unix()
return
}
// NewAdminTaskEx returns a new adminTask.
func NewAdminTaskEx(opCode uint8, opAddr string, request interface{}, reqID string) (t *AdminTask) {
t = new(AdminTask)
t.OpCode = opCode
t.Request = request
t.OperatorAddr = opAddr
t.ID = fmt.Sprintf("addr[%v]_op[%v]_reqID[%v]", t.OperatorAddr, t.OpCode, reqID)
t.CreateTime = time.Now().Unix()
return
}
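// Illustrative sketch, not part of the original API: how a caller might drive
// an AdminTask with the helpers above. The opcode and address are placeholders.
func exampleAdminTaskLifecycle() {
	t := NewAdminTask(OpDataNodeHeartbeat, "192.168.0.1:17310", nil)
	t.SetStatus(TaskStart)
	if t.CheckTaskNeedSend() {
		// send the task to t.OperatorAddr here (transport omitted in this sketch)
		t.SendCount++
		t.SendTime = time.Now().Unix()
	}
	if t.CheckTaskTimeOut() {
		t.SetStatus(TaskFailed)
	} else if t.IsTaskSuccessful() {
		_ = t.Response // the receiver has filled in the response
	}
}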
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"encoding/base64"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"time"
"github.com/cubefs/cubefs/util/caps"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/keystore"
)
// ServiceID defines the type of tickets
type ServiceID uint32
// MsgType defines the type of req/resp for message
type MsgType uint32
// Nonce defines the nonce to mitigate the replay attack
type Nonce uint64
const (
APIRsc = "API"
APIAccess = "access"
capSeparator = ":"
reqLiveLength = 10
ClientMessage = "Token"
OwnerVOLRsc = "OwnerVOL"
NoneOwnerVOLRsc = "NoneOwnerVOL"
VOLAccess = "*"
)
// api
const (
// Client APIs
ClientGetTicket = "/client/getticket"
// Admin APIs
AdminCreateKey = "/admin/createkey"
AdminDeleteKey = "/admin/deletekey"
AdminGetKey = "/admin/getkey"
AdminAddCaps = "/admin/addcaps"
AdminDeleteCaps = "/admin/deletecaps"
AdminGetCaps = "/admin/getcaps"
// raft node APIs
AdminAddRaftNode = "/admin/addraftnode"
AdminRemoveRaftNode = "/admin/removeraftnode"
// Object node APIs
OSAddCaps = "/os/addcaps"
OSDeleteCaps = "/os/deletecaps"
OSGetCaps = "/os/getcaps"
)
const (
// AuthServiceID defines ticket for authnode access (not supported)
AuthServiceID = "AuthService"
// MasterServiceID defines ticket for master access
MasterServiceID = "MasterService"
// MetaServiceID defines ticket for metanode access (not supported)
MetaServiceID = "MetanodeService"
// DataServiceID defines ticket for datanode access (not supported)
DataServiceID = "DatanodeService"
// ObjectServiceID defines ticket for objectnode access
ObjectServiceID = "ObjectService"
)
const (
MasterNode = "master"
MetaNode = "metanode"
DataNode = "datanode"
)
const (
// MsgAuthBase defines the starting value for auth messages
MsgAuthBase MsgType = 0x100000
// MsgAuthTicketReq request type for an auth ticket
MsgAuthTicketReq MsgType = MsgAuthBase + 0x10000
// MsgAuthTicketResp response type for an auth ticket
MsgAuthTicketResp MsgType = MsgAuthBase + 0x10001
// MsgMasterTicketReq request type for a master ticket
MsgMasterTicketReq MsgType = MsgAuthBase + 0x20000
// MsgMasterTicketResp response type for a master ticket
MsgMasterTicketResp MsgType = MsgAuthBase + 0x20001
// MsgMetaTicketReq request type for a metanode ticket
MsgMetaTicketReq MsgType = MsgAuthBase + 0x30000
// MsgMetaTicketResp response type for a metanode ticket
MsgMetaTicketResp MsgType = MsgAuthBase + 0x30001
// MsgDataTicketReq request type for a datanode ticket
MsgDataTicketReq MsgType = MsgAuthBase + 0x40000
// MsgDataTicketResp response type for a datanode ticket
MsgDataTicketResp MsgType = MsgAuthBase + 0x40001
// MsgAuthCreateKeyReq request type for authnode add key
MsgAuthCreateKeyReq MsgType = MsgAuthBase + 0x51000
// MsgAuthCreateKeyResp response type for authnode add key
MsgAuthCreateKeyResp MsgType = MsgAuthBase + 0x51001
// MsgAuthDeleteKeyReq request type for authnode delete key
MsgAuthDeleteKeyReq MsgType = MsgAuthBase + 0x52000
// MsgAuthDeleteKeyResp response type for authnode delete key
MsgAuthDeleteKeyResp MsgType = MsgAuthBase + 0x52001
// MsgAuthGetKeyReq request type for authnode get key info
MsgAuthGetKeyReq MsgType = MsgAuthBase + 0x53000
// MsgAuthGetKeyResp response type for authnode get key info
MsgAuthGetKeyResp MsgType = MsgAuthBase + 0x53001
// MsgAuthAddCapsReq request type for authnode add caps
MsgAuthAddCapsReq MsgType = MsgAuthBase + 0x54000
// MsgAuthAddCapsResp response type for authnode add caps
MsgAuthAddCapsResp MsgType = MsgAuthBase + 0x54001
// MsgAuthDeleteCapsReq request type for authnode delete caps
MsgAuthDeleteCapsReq MsgType = MsgAuthBase + 0x55000
// MsgAuthDeleteCapsResp response type for authnode delete caps
MsgAuthDeleteCapsResp MsgType = MsgAuthBase + 0x55001
// MsgAuthGetCapsReq request type for authnode get caps
MsgAuthGetCapsReq MsgType = MsgAuthBase + 0x56000
// MsgAuthGetCapsResp response type for authnode get caps
MsgAuthGetCapsResp MsgType = MsgAuthBase + 0x56001
// MsgAuthAddRaftNodeReq request type for authnode add node
MsgAuthAddRaftNodeReq MsgType = MsgAuthBase + 0x57000
// MsgAuthAddRaftNodeResp response type for authnode add node
MsgAuthAddRaftNodeResp MsgType = MsgAuthBase + 0x57001
// MsgAuthRemoveRaftNodeReq request type for authnode remove node
MsgAuthRemoveRaftNodeReq MsgType = MsgAuthBase + 0x58000
// MsgAuthRemoveRaftNodeResp response type for authnode remove node
MsgAuthRemoveRaftNodeResp MsgType = MsgAuthBase + 0x58001
// MsgAuthOSAddCapsReq request type from ObjectNode to add caps
MsgAuthOSAddCapsReq MsgType = MsgAuthBase + 0x61000
// MsgAuthOSAddCapsResp response type from ObjectNode to add caps
MsgAuthOSAddCapsResp MsgType = MsgAuthBase + 0x61001
// MsgAuthOSDeleteCapsReq request type from ObjectNode to delete caps
MsgAuthOSDeleteCapsReq MsgType = MsgAuthBase + 0x62000
// MsgAuthOSDeleteCapsResp response type from ObjectNode to delete caps
MsgAuthOSDeleteCapsResp MsgType = MsgAuthBase + 0x62001
// MsgAuthOSGetCapsReq request type from ObjectNode to get caps
MsgAuthOSGetCapsReq MsgType = MsgAuthBase + 0x63000
// MsgAuthOSGetCapsResp response type from ObjectNode to get caps
MsgAuthOSGetCapsResp MsgType = MsgAuthBase + 0x63001
// MsgMasterAPIAccessReq request type for master api access
MsgMasterAPIAccessReq MsgType = 0x60000
// MsgMasterAPIAccessResp response type for master api access
MsgMasterAPIAccessResp MsgType = 0x60001
// Master API ClientVol
MsgMasterFetchVolViewReq MsgType = MsgMasterAPIAccessReq + 0x10000
// Master API cluster management
MsgMasterClusterFreezeReq MsgType = MsgMasterAPIAccessReq + 0x20100
MsgMasterAddRaftNodeReq MsgType = MsgMasterAPIAccessReq + 0x20200
MsgMasterRemoveRaftNodeReq MsgType = MsgMasterAPIAccessReq + 0x20300
MsgMasterSetNodeInfoReq MsgType = MsgMasterAPIAccessReq + 0x20400
MsgMasterSetNodeRdOnlyReq MsgType = MsgMasterAPIAccessReq + 0x20500
MsgMasterAutoDecommissionReq MsgType = MsgMasterAPIAccessReq + 0x20600
// Master API volume management
MsgMasterCreateVolReq MsgType = MsgMasterAPIAccessReq + 0x30100
MsgMasterDeleteVolReq MsgType = MsgMasterAPIAccessReq + 0x30200
MsgMasterUpdateVolReq MsgType = MsgMasterAPIAccessReq + 0x30300
MsgMasterVolShrinkReq MsgType = MsgMasterAPIAccessReq + 0x30400
MsgMasterVolExpandReq MsgType = MsgMasterAPIAccessReq + 0x30500
// Master API meta partition management
MsgMasterLoadMetaPartitionReq MsgType = MsgMasterAPIAccessReq + 0x40100
MsgMasterDecommissionMetaPartitionReq MsgType = MsgMasterAPIAccessReq + 0x40200
MsgMasterChangeMetaPartitionLeaderReq MsgType = MsgMasterAPIAccessReq + 0x40300
MsgMasterCreateMetaPartitionReq MsgType = MsgMasterAPIAccessReq + 0x40400
MsgMasterAddMetaReplicaReq MsgType = MsgMasterAPIAccessReq + 0x40500
MsgMasterDeleteMetaReplicaReq MsgType = MsgMasterAPIAccessReq + 0x40600
MsgMasterQosUpdateReq MsgType = MsgMasterAPIAccessReq + 0x40700
MsgMasterQosUpdateZoneLimitReq MsgType = MsgMasterAPIAccessReq + 0x40800
MsgMasterQosUpdateMasterLimitReq MsgType = MsgMasterAPIAccessReq + 0x40900
MsgMasterQosUpdateClientParamReq MsgType = MsgMasterAPIAccessReq + 0x40a00
// Master API data partition management
MsgMasterCreateDataPartitionReq MsgType = MsgMasterAPIAccessReq + 0x50100
MsgMasterDataPartitionChangeLeaderReq MsgType = MsgMasterAPIAccessReq + 0x50200
MsgMasterLoadDataPartitionReq MsgType = MsgMasterAPIAccessReq + 0x50300
MsgMasterDecommissionDataPartitionReq MsgType = MsgMasterAPIAccessReq + 0x50400
MsgMasterAddDataReplicaReq MsgType = MsgMasterAPIAccessReq + 0x50500
MsgMasterDeleteDataReplicaReq MsgType = MsgMasterAPIAccessReq + 0x50600
MsgMasterSetDpRdOnlyReq MsgType = MsgMasterAPIAccessReq + 0x50700
MsgMasterReportLackDataPartitions MsgType = MsgMasterAPIAccessReq + 0x50800
// Master API meta node management
MsgMasterAddMetaNodeReq MsgType = MsgMasterAPIAccessReq + 0x60100
MsgMasterDecommissionMetaNodeReq MsgType = MsgMasterAPIAccessReq + 0x60200
MsgMasterMigrateMetaNodeReq MsgType = MsgMasterAPIAccessReq + 0x60300
MsgMasterSetMetaNodeThresholdReq MsgType = MsgMasterAPIAccessReq + 0x60400
MsgMasterUpdateMetaNodeReq MsgType = MsgMasterAPIAccessReq + 0x60500
// Master API data node management
MsgMasterAddDataNodeReq MsgType = MsgMasterAPIAccessReq + 0x70100
MsgMasterDecommissionDataNodeReq MsgType = MsgMasterAPIAccessReq + 0x70200
MsgMasterMigrateDataNodeReq MsgType = MsgMasterAPIAccessReq + 0x70300
MsgMasterCancelDecommissionDataNodeReq MsgType = MsgMasterAPIAccessReq + 0x70400
MsgMasterDecommissionDiskReq MsgType = MsgMasterAPIAccessReq + 0x70500
MsgMasterUpdateNodeSetCapcityReq MsgType = MsgMasterAPIAccessReq + 0x70600
MsgMasterUpdateNodeSetIdReq MsgType = MsgMasterAPIAccessReq + 0x70700
MsgMasterUpdateDomainDataUseRatioReq MsgType = MsgMasterAPIAccessReq + 0x70800
MsgMasterUpdateZoneExcludeRatioReq MsgType = MsgMasterAPIAccessReq + 0x70900
MsgMasterRecommissionDiskReq MsgType = MsgMasterAPIAccessReq + 0x70a00
// Master API user management
MsgMasterUserCreateReq MsgType = MsgMasterAPIAccessReq + 0x80100
MsgMasterUserDeleteReq MsgType = MsgMasterAPIAccessReq + 0x80200
MsgMasterUserUpdateReq MsgType = MsgMasterAPIAccessReq + 0x80300
MsgMasterUserUpdatePolicyReq MsgType = MsgMasterAPIAccessReq + 0x80400
MsgMasterUserRemovePolicyReq MsgType = MsgMasterAPIAccessReq + 0x80500
MsgMasterUserDeleteVolPolicyReq MsgType = MsgMasterAPIAccessReq + 0x80600
MsgMasterUserTransferVolReq MsgType = MsgMasterAPIAccessReq + 0x80700
// Master API zone management
MsgMasterUpdateZoneReq MsgType = MsgMasterAPIAccessReq + 0x90100
)
// HTTPAuthReply uniform response structure
type HTTPAuthReply = HTTPReply
// MsgType2ResourceMap defines the mapping from message type to resource
var MsgType2ResourceMap = map[MsgType]string{
MsgAuthCreateKeyReq: "auth:createkey",
MsgAuthDeleteKeyReq: "auth:deletekey",
MsgAuthGetKeyReq: "auth:getkey",
MsgAuthAddCapsReq: "auth:addcaps",
MsgAuthDeleteCapsReq: "auth:deletecaps",
MsgAuthGetCapsReq: "auth:getcaps",
MsgAuthAddRaftNodeReq: "auth:addnode",
MsgAuthRemoveRaftNodeReq: "auth:removenode",
MsgAuthOSAddCapsReq: "auth:osaddcaps",
MsgAuthOSDeleteCapsReq: "auth:osdeletecaps",
MsgAuthOSGetCapsReq: "auth:osgetcaps",
MsgMasterFetchVolViewReq: "master:getvol",
// Master API cluster management
MsgMasterClusterFreezeReq: "master:clusterfreeze",
MsgMasterAddRaftNodeReq: "master:addraftnode",
MsgMasterRemoveRaftNodeReq: "master:removeraftnode",
MsgMasterSetNodeInfoReq: "master:setnodeinfo",
MsgMasterSetNodeRdOnlyReq: "master:sernoderdonly",
MsgMasterAutoDecommissionReq: "master:autodecommission",
// Master API volume management
MsgMasterCreateVolReq: "master:createvol",
MsgMasterDeleteVolReq: "master:deletevol",
MsgMasterUpdateVolReq: "master:updatevol",
MsgMasterVolShrinkReq: "master:volshrink",
MsgMasterVolExpandReq: "master:volexpand",
// Master API meta partition management
MsgMasterLoadMetaPartitionReq: "master:loadmetapartition",
MsgMasterDecommissionMetaPartitionReq: "master:decommissionmetapartition",
MsgMasterChangeMetaPartitionLeaderReq: "master:changemetapartitionleader",
MsgMasterCreateMetaPartitionReq: "master:createmetapartition",
MsgMasterAddMetaReplicaReq: "master:addmetareplica",
MsgMasterDeleteMetaReplicaReq: "master:deletemetareplica",
MsgMasterQosUpdateReq: "master:qosupdate",
MsgMasterQosUpdateZoneLimitReq: "master:qosupdatezonelimit",
MsgMasterQosUpdateMasterLimitReq: "master:qosupdatemasterlimit",
MsgMasterQosUpdateClientParamReq: "master:qosupdateclientparam",
// Master API data partition management
MsgMasterCreateDataPartitionReq: "master:createdatapartition",
MsgMasterDataPartitionChangeLeaderReq: "master:changedatapartitionleader",
MsgMasterLoadDataPartitionReq: "master:loaddatapartition",
MsgMasterDecommissionDataPartitionReq: "master:decommissiondatapartition",
MsgMasterAddDataReplicaReq: "master:adddatareplica",
MsgMasterDeleteDataReplicaReq: "master:removedatareplica",
MsgMasterSetDpRdOnlyReq: "master:setdprdonly",
MsgMasterReportLackDataPartitions: "master:reportLackDataPartitions",
// Master API meta node management
MsgMasterAddMetaNodeReq: "master:addmetanode",
MsgMasterDecommissionMetaNodeReq: "master:decommissionmetanode",
MsgMasterMigrateMetaNodeReq: "master:migratemetanode",
MsgMasterSetMetaNodeThresholdReq: "master:setmetanodethreshold",
MsgMasterUpdateMetaNodeReq: "master:updatemetanode",
// Master API data node management
MsgMasterAddDataNodeReq: "master:adddatannode",
MsgMasterDecommissionDataNodeReq: "master:decommissiondatannode",
MsgMasterMigrateDataNodeReq: "master:migratedatannode",
MsgMasterCancelDecommissionDataNodeReq: "master:canceldecommissiondatannode",
MsgMasterDecommissionDiskReq: "master:decommissiondisk",
MsgMasterUpdateNodeSetCapcityReq: "master:updatenodesetcapcity",
MsgMasterUpdateNodeSetIdReq: "master:updatenodesetid",
MsgMasterUpdateDomainDataUseRatioReq: "master:updatedomaindatauseratio",
MsgMasterUpdateZoneExcludeRatioReq: "master:updatezoneexcluderatio",
MsgMasterRecommissionDiskReq: "master:recommissiondisk",
// Master API user management
MsgMasterUserCreateReq: "master:usercreate",
MsgMasterUserDeleteReq: "master:userdelete",
MsgMasterUserUpdateReq: "master:userupdate",
MsgMasterUserUpdatePolicyReq: "master:userupdatepolicy",
MsgMasterUserRemovePolicyReq: "master:userremotepolicy",
MsgMasterUserDeleteVolPolicyReq: "master:userdeletevolpolicy",
MsgMasterUserTransferVolReq: "master:usertransfervol",
// Master API zone management
MsgMasterUpdateZoneReq: "master:updatezone",
}
// AuthGetTicketReq defines the message from client to authnode
// use Timestamp as verifier for MITM mitigation
// verifier is also used to verify the server identity
type AuthGetTicketReq struct {
Type MsgType `json:"type"`
ClientID string `json:"client_id"`
ServiceID string `json:"service_id"`
Verifier string `json:"verifier"`
}
// AuthGetTicketResp defines the message from authnode to client
type AuthGetTicketResp struct {
Type MsgType `json:"type"`
ClientID string `json:"client_id"`
ServiceID string `json:"service_id"`
Verifier int64 `json:"verifier"`
Ticket string `json:"ticket"`
SessionKey cryptoutil.CryptoKey `json:"session_key"`
}
// APIAccessReq defines the request for access restful api
// use Timestamp as verifier for MITM mitigation
// verifier is also used to verify the server identity
type APIAccessReq struct {
Type MsgType `json:"type"`
ClientID string `json:"client_id"`
ServiceID string `json:"service_id"`
Verifier string `json:"verifier"`
Ticket string `json:"ticket"`
}
// APIAccessResp defines the response for access restful api
// use Timestamp as verifier for MITM mitigation
// verifier is also used to verify the server identity
type APIAccessResp struct {
Type MsgType `json:"type"`
ClientID string `json:"client_id"`
ServiceID string `json:"service_id"`
Verifier int64 `json:"verifier"`
}
// AuthAPIAccessReq defines Auth API request
type AuthAPIAccessReq struct {
APIReq APIAccessReq `json:"api_req"`
KeyInfo keystore.KeyInfo `json:"key_info"`
}
// AuthAPIAccessResp defines the response for creating a key in authnode
type AuthAPIAccessResp struct {
APIResp APIAccessResp `json:"api_resp"`
KeyInfo keystore.KeyInfo `json:"key_info"`
AuthIDKey string `json:"auth_id_key"`
}
// AuthRaftNodeInfo defines raft node information
type AuthRaftNodeInfo struct {
ID uint64 `json:"id"`
Addr string `json:"addr"`
}
// AuthRaftNodeReq defines Auth API request for add/remove a raft node
type AuthRaftNodeReq struct {
APIReq APIAccessReq `json:"api_req"`
RaftNodeInfo AuthRaftNodeInfo `json:"node_info"`
}
// AuthRaftNodeResp defines Auth API response for add/remove a raft node
type AuthRaftNodeResp struct {
APIResp APIAccessResp `json:"api_resp"`
Msg string `json:"msg"`
}
// AuthOSAccessKeyReq defines the Auth API request for put/delete access key caps
type AuthOSAccessKeyReq struct {
APIReq APIAccessReq `json:"api_req"`
AKCaps keystore.AccessKeyCaps `json:"access_key_caps"`
}
// AuthOSAccessKeyResp defines the Auth API response for put/delete access key caps
type AuthOSAccessKeyResp struct {
APIResp APIAccessResp `json:"api_resp"`
AKCaps keystore.AccessKeyCaps `json:"access_key_caps"`
}
// IsValidServiceID determines the validity of a serviceID
func IsValidServiceID(serviceID string) (err error) {
if serviceID != AuthServiceID && serviceID != MasterServiceID && serviceID != MetaServiceID && serviceID != DataServiceID {
err = fmt.Errorf("invalid service ID [%s]", serviceID)
return
}
return
}
// IsValidMsgReqType determines the validity of a message type
func IsValidMsgReqType(serviceID string, msgType MsgType) (err error) {
b := false
switch serviceID {
case AuthServiceID:
fallthrough
case MasterServiceID:
if msgType|MsgAuthBase != 0 {
b = true
}
default:
// do nothing
}
if !b {
err = fmt.Errorf("invalid request type [%x] and serviceID[%s]", msgType, serviceID)
return
}
return
}
// IsValidClientID determines the validity of a clientID
func IsValidClientID(id string) (err error) {
re := regexp.MustCompile("^[A-Za-z]{1,1}[A-Za-z0-9_]{0,20}$")
if !re.MatchString(id) {
err = fmt.Errorf("clientID invalid format [%s]", id)
return
}
return
}
// ParseAuthReply parses the response from the authnode
func ParseAuthReply(body []byte) (jobj HTTPAuthReply, err error) {
if err = json.Unmarshal(body, &jobj); err != nil {
return
}
if jobj.Code != 0 {
err = fmt.Errorf("%s", jobj.Msg)
return
}
return
}
// GetDataFromResp extracts data from the response
func GetDataFromResp(body []byte, key []byte) (plaintext []byte, err error) {
jobj, err := ParseAuthReply(body)
if err != nil {
return
}
data := fmt.Sprint(jobj.Data)
if plaintext, err = cryptoutil.DecodeMessage(data, key); err != nil {
return
}
return
}
// ParseAuthGetTicketResp parses and validates the auth get ticket resp
func ParseAuthGetTicketResp(body []byte, key []byte) (resp AuthGetTicketResp, err error) {
var plaintext []byte
if plaintext, err = GetDataFromResp(body, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
// ParseAuthAPIAccessResp parses and validates the auth api access resp
func ParseAuthAPIAccessResp(body []byte, key []byte) (resp AuthAPIAccessResp, err error) {
var plaintext []byte
if plaintext, err = GetDataFromResp(body, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
// ParseAuthRaftNodeResp parses and validates the auth raft node resp
func ParseAuthRaftNodeResp(body []byte, key []byte) (resp AuthRaftNodeResp, err error) {
var plaintext []byte
if plaintext, err = GetDataFromResp(body, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
func ParseAuthOSAKResp(body []byte, key []byte) (resp AuthOSAccessKeyResp, err error) {
var plaintext []byte
if plaintext, err = GetDataFromResp(body, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
func ExtractTicket(str string, key []byte) (ticket cryptoutil.Ticket, err error) {
var plaintext []byte
if plaintext, err = cryptoutil.DecodeMessage(str, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &ticket); err != nil {
return
}
return
}
func checkTicketCaps(ticket *cryptoutil.Ticket, kind string, cap string) (err error) {
c := new(caps.Caps)
if err = c.Init(ticket.Caps); err != nil {
return
}
if b := c.ContainCaps(kind, cap); !b {
err = fmt.Errorf("no permission to access %v", kind)
return
}
return
}
// ParseVerifier checks the verifier structure for replay attack mitigation
func ParseVerifier(verifier string, key []byte) (ts int64, err error) {
var plaintext []byte
if plaintext, err = cryptoutil.DecodeMessage(verifier, key); err != nil {
return
}
ts = int64(binary.LittleEndian.Uint64(plaintext))
if time.Now().Unix()-ts >= reqLiveLength { // mitigate replay attack
err = fmt.Errorf("req verifier is timeout [%d] >= [%d]", time.Now().Unix()-ts, reqLiveLength)
return
}
return
}
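// Illustrative sketch, not part of the original API: the verifier is an 8-byte
// little-endian Unix timestamp; after decryption it is accepted only while it
// is younger than reqLiveLength seconds, which is what bounds replay attacks.
func exampleVerifierIsFresh(decrypted []byte) bool {
	ts := int64(binary.LittleEndian.Uint64(decrypted))
	return time.Now().Unix()-ts < reqLiveLength
}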
// VerifyAPIAccessReqIDs verifies the request IDs
func VerifyAPIAccessReqIDs(req *APIAccessReq) (err error) {
if err = IsValidClientID(req.ClientID); err != nil {
err = fmt.Errorf("IsValidClientID failed: %s", err.Error())
return
}
if err = IsValidServiceID(req.ServiceID); err != nil {
err = fmt.Errorf("IsValidServiceID failed: %s", err.Error())
return
}
if err = IsValidMsgReqType(req.ServiceID, req.Type); err != nil {
err = fmt.Errorf("IsValidMsgReqType failed: %s", err.Error())
return
}
return
}
// ExtractAPIAccessTicket extracts the ticket and verifies its validity
func ExtractAPIAccessTicket(req *APIAccessReq, key []byte) (ticket cryptoutil.Ticket, ts int64, err error) {
if ticket, err = ExtractTicket(req.Ticket, key); err != nil {
err = fmt.Errorf("extractTicket failed: %s", err.Error())
return
}
if time.Now().Unix() >= ticket.Exp {
err = ErrExpiredTicket
return
}
if ts, err = ParseVerifier(req.Verifier, ticket.SessionKey.Key); err != nil {
err = fmt.Errorf("parseVerifier failed: %s", err.Error())
return
}
return
}
// CheckAPIAccessCaps checks capability
func CheckAPIAccessCaps(ticket *cryptoutil.Ticket, rscType string, mp MsgType, action string) (err error) {
if _, ok := MsgType2ResourceMap[mp]; !ok {
err = fmt.Errorf("MsgType2ResourceMap key not found [%d]", mp)
return
}
rule := MsgType2ResourceMap[mp] + capSeparator + action
if err = checkTicketCaps(ticket, rscType, rule); err != nil {
err = fmt.Errorf("checkTicketCaps failed: %s", err.Error())
return
}
return
}
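// Illustrative sketch, not part of the original API: the order of server-side
// checks for an authenticated API request. APIRsc and APIAccess are used here
// as an assumed resource type and action; a real service supplies its own.
func exampleVerifyAPIAccess(req *APIAccessReq, serviceKey []byte) error {
	if err := VerifyAPIAccessReqIDs(req); err != nil {
		return err
	}
	ticket, _, err := ExtractAPIAccessTicket(req, serviceKey)
	if err != nil {
		return err
	}
	return CheckAPIAccessCaps(&ticket, APIRsc, req.Type, APIAccess)
}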
func GenAuthIDKey(id string, authKey []byte) (authIDKey string, err error) {
type AuthIDKey struct {
ID string `json:"id"`
AuthKey []byte `json:"auth_key"`
}
tmpAuthIDKey := AuthIDKey{
ID: id,
AuthKey: authKey,
}
var jAuthIDKey []byte
if jAuthIDKey, err = json.Marshal(tmpAuthIDKey); err != nil {
err = fmt.Errorf("json marshal authIDKey failed %s", err.Error())
return
}
authIDKey = cryptoutil.Base64Encode(jAuthIDKey)
return
}
func ExtractIDAndAuthKey(authIDKey string) (id string, authKey []byte, err error) {
type AuthIDKey struct {
ID string `json:"id"`
AuthKey string `json:"auth_key"`
}
var jAuthIDKey []byte
jAuthIDKey, err = cryptoutil.Base64Decode(authIDKey)
if err != nil {
err = fmt.Errorf("decode authIDKey failed %s", err.Error())
return
}
tmpAuthIDKey := &AuthIDKey{}
if err = json.Unmarshal(jAuthIDKey, &tmpAuthIDKey); err != nil {
err = fmt.Errorf("json unmarshal authIDKey failed %s", err.Error())
return
}
id = tmpAuthIDKey.ID
authKey = []byte(tmpAuthIDKey.AuthKey)
return
}
func CheckVOLAccessCaps(ticket *cryptoutil.Ticket, volName string, action string, accessNode string) (err error) {
rule := accessNode + capSeparator + volName + capSeparator + action
if err = checkTicketCaps(ticket, OwnerVOLRsc, rule); err != nil {
if err = checkTicketCaps(ticket, NoneOwnerVOLRsc, rule); err != nil {
err = fmt.Errorf("checkTicketCaps failed: %s", err.Error())
return
}
}
return
}
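// Illustrative sketch, not part of the original API: the volume capability
// rule checked above is "<accessNode>:<volName>:<action>", tried first under
// the owner-volume resource and then under the non-owner one.
func exampleVolCapRule(accessNode, volName, action string) string {
	return accessNode + capSeparator + volName + capSeparator + action
}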
// VerifyAPIRespComm verifies, on the client side, the common attributes returned from the server
func VerifyAPIRespComm(apiResp *APIAccessResp, msg MsgType, clientID string, serviceID string, ts int64) (err error) {
if ts+1 != apiResp.Verifier {
err = fmt.Errorf("verifier verification failed")
return
}
if apiResp.Type != msg+1 {
err = fmt.Errorf("msg verification failed")
return
}
if apiResp.ClientID != clientID {
err = fmt.Errorf("id verification failed")
return
}
if apiResp.ServiceID != serviceID {
err = fmt.Errorf("service id verification failed")
return
}
return
}
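// Added note: the checks above rely on two conventions defined earlier in this
// package: every *Resp message type equals its *Req type plus one (for example
// MsgAuthTicketResp = MsgAuthTicketReq + 1), and the server echoes the
// client's verifier timestamp incremented by one.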
// VerifyTicketRespComm verifies the ticket response from the server
func VerifyTicketRespComm(ticketResp *AuthGetTicketResp, msg MsgType, clientID string, serviceID string, ts int64) (err error) {
if ts+1 != ticketResp.Verifier {
err = fmt.Errorf("verifier verification failed")
return
}
if ticketResp.Type != msg+1 {
err = fmt.Errorf("msg verification failed")
return
}
if ticketResp.ClientID != clientID {
err = fmt.Errorf("id verification failed")
return
}
if ticketResp.ServiceID != serviceID {
err = fmt.Errorf("service id verification failed")
return
}
return
}
// SendBytes sends raw bytes to the target over the http/https protocol
func SendBytes(client *http.Client, target string, data []byte) (res []byte, err error) {
message := base64.StdEncoding.EncodeToString(data)
resp, err := client.PostForm(target, url.Values{ClientMessage: {message}})
if err != nil {
err = fmt.Errorf("action[SendData] failed:" + err.Error())
return
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
err = fmt.Errorf("action[doRealSend] failed:" + err.Error())
return
}
res = body
return
}
// SendData sends data to target
func SendData(client *http.Client, target string, data interface{}) (res []byte, err error) {
messageJSON, err := json.Marshal(data)
if err != nil {
err = fmt.Errorf("action[doRealSend] failed:" + err.Error())
return
}
if res, err = SendBytes(client, target, messageJSON); err != nil {
return
}
return
}
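// Illustrative sketch, not part of the original API: requesting a ticket from
// an authnode with SendData. The host is a placeholder, and the key passed to
// ParseAuthGetTicketResp is assumed to be the client's secret key.
func exampleRequestTicket(client *http.Client, req *AuthGetTicketReq, key []byte) (AuthGetTicketResp, error) {
	body, err := SendData(client, "http://authnode.example:8080"+ClientGetTicket, req)
	if err != nil {
		return AuthGetTicketResp{}, err
	}
	return ParseAuthGetTicketResp(body, key)
}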
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import "github.com/cubefs/cubefs/util/errors"
// err
var (
ErrSuc = errors.New("success")
ErrInternalError = errors.New("internal error")
ErrParamError = errors.New("parameter error")
ErrInvalidCfg = errors.New("bad configuration file")
ErrPersistenceByRaft = errors.New("persistence by raft occurred error")
ErrMarshalData = errors.New("marshal data error")
ErrUnmarshalData = errors.New("unmarshal data error")
ErrVolNotExists = errors.New("vol not exists")
ErrMetaPartitionNotExists = errors.New("meta partition not exists")
ErrDataPartitionNotExists = errors.New("data partition not exists")
ErrDataNodeNotExists = errors.New("data node not exists")
ErrMetaNodeNotExists = errors.New("meta node not exists")
ErrDuplicateVol = errors.New("duplicate vol")
ErrActiveDataNodesTooLess = errors.New("no enough active data node")
ErrActiveMetaNodesTooLess = errors.New("no enough active meta node")
ErrInvalidMpStart = errors.New("invalid meta partition start value")
ErrNoAvailDataPartition = errors.New("no available data partition")
ErrReshuffleArray = errors.New("the array to be reshuffled is nil")
ErrIllegalDataReplica = errors.New("data replica is illegal")
ErrMissingReplica = errors.New("a missing data replica is found")
ErrHasOneMissingReplica = errors.New("there is a missing replica")
ErrNoDataNodeToWrite = errors.New("No data node available for creating a data partition")
ErrNoMetaNodeToWrite = errors.New("No meta node available for creating a meta partition")
ErrCannotBeOffLine = errors.New("cannot take the data replica offline")
ErrNoDataNodeToCreateDataPartition = errors.New("no enough data nodes for creating a data partition")
ErrNoZoneToCreateDataPartition = errors.New("no zone available for creating a data partition")
ErrNoZoneToCreateMetaPartition = errors.New("no zone available for creating a meta partition")
ErrNoNodeSetToCreateDataPartition = errors.New("no node set available for creating a data partition")
ErrNoNodeSetToCreateMetaPartition = errors.New("no node set available for creating a meta partition")
ErrNoMetaNodeToCreateMetaPartition = errors.New("no enough meta nodes for creating a meta partition")
ErrIllegalMetaReplica = errors.New("illegal meta replica")
ErrNoEnoughReplica = errors.New("no enough replicas")
ErrNoLeader = errors.New("no leader")
ErrVolAuthKeyNotMatch = errors.New("client and server auth key do not match")
ErrAuthKeyStoreError = errors.New("auth keystore error")
ErrAuthAPIAccessGenRespError = errors.New("auth API access response error")
ErrAuthOSCapsOpGenRespError = errors.New("auth Object Storage node API response error")
ErrKeyNotExists = errors.New("key not exists")
ErrDuplicateKey = errors.New("duplicate key")
ErrAccessKeyNotExists = errors.New("access key not exists")
ErrInvalidTicket = errors.New("invalid ticket")
ErrInvalidClientIDKey = errors.New("invalid clientIDKey")
ErrExpiredTicket = errors.New("expired ticket")
ErrMasterAPIGenRespError = errors.New("master API generate response error")
ErrDuplicateUserID = errors.New("duplicate user id")
ErrUserNotExists = errors.New("user not exists")
ErrReadBodyError = errors.New("read request body failed")
ErrVolPolicyNotExists = errors.New("vol policy not exists")
ErrDuplicateAccessKey = errors.New("duplicate access key")
ErrHaveNoPolicy = errors.New("no vol policy")
ErrZoneNotExists = errors.New("zone not exists")
ErrOwnVolExists = errors.New("own vols not empty")
ErrSuperAdminExists = errors.New("super administrator exists ")
ErrInvalidUserID = errors.New("invalid user ID")
ErrInvalidUserType = errors.New("invalid user type")
ErrNoPermission = errors.New("no permission")
ErrTokenNotFound = errors.New("token not found")
ErrInvalidAccessKey = errors.New("invalid access key")
ErrInvalidSecretKey = errors.New("invalid secret key")
ErrIsOwner = errors.New("user owns the volume")
ErrZoneNum = errors.New("zone num not qualified")
ErrNoNodeSetToUpdateDecommissionLimit = errors.New("no node set available for updating decommission limit")
ErrNoNodeSetToQueryDecommissionLimitStatus = errors.New("no node set available for query decommission limit status")
ErrNoNodeSetToDecommission = errors.New("no node set available to decommission ")
ErrVolNoAvailableSpace = errors.New("vol has no available space")
ErrVolNoCacheAndRule = errors.New("vol has no cache and rule")
ErrNoAclPermission = errors.New("acl no permission")
ErrQuotaNotExists = errors.New("quota not exists")
ErrCodeVersionOp = errors.New("version op failed")
ErrNoSuchLifecycleConfiguration = errors.New("The lifecycle configuration does not exist")
ErrNoNodeSetToUpdateDecommissionDiskFactor = errors.New("no node set available for updating decommission disk factor")
ErrNoNodeSetToQueryDecommissionDiskLimit = errors.New("no node set available for query decommission disk limit")
ErrNodeSetNotExists = errors.New("node set not exists")
ErrCompressFailed = errors.New("compress data failed")
ErrDecompressFailed = errors.New("decompress data failed")
)
// http response error code and error message definitions
const (
ErrCodeSuccess = iota
ErrCodeInternalError
ErrCodeParamError
ErrCodeInvalidCfg
ErrCodePersistenceByRaft
ErrCodeMarshalData
ErrCodeUnmarshalData
ErrCodeVolNotExists
ErrCodeMetaPartitionNotExists
ErrCodeDataPartitionNotExists
ErrCodeDataNodeNotExists
ErrCodeMetaNodeNotExists
ErrCodeDuplicateVol
ErrCodeActiveDataNodesTooLess
ErrCodeActiveMetaNodesTooLess
ErrCodeInvalidMpStart
ErrCodeNoAvailDataPartition
ErrCodeReshuffleArray
ErrCodeIllegalDataReplica
ErrCodeMissingReplica
ErrCodeHasOneMissingReplica
ErrCodeNoDataNodeToWrite
ErrCodeNoMetaNodeToWrite
ErrCodeCannotBeOffLine
ErrCodeNoDataNodeToCreateDataPartition
ErrCodeNoZoneToCreateDataPartition
ErrCodeNoNodeSetToCreateDataPartition
ErrCodeNoNodeSetToCreateMetaPartition
ErrCodeNoMetaNodeToCreateMetaPartition
ErrCodeIllegalMetaReplica
ErrCodeNoEnoughReplica
ErrCodeNoLeader
ErrCodeVolAuthKeyNotMatch
ErrCodeAuthKeyStoreError
ErrCodeAuthAPIAccessGenRespError
ErrCodeAuthRaftNodeGenRespError
ErrCodeAuthOSCapsOpGenRespError
ErrCodeAuthReqRedirectError
ErrCodeAccessKeyNotExists
ErrCodeInvalidTicket
ErrCodeInvalidClientIDKey
ErrCodeExpiredTicket
ErrCodeMasterAPIGenRespError
ErrCodeDuplicateUserID
ErrCodeUserNotExists
ErrCodeReadBodyError
ErrCodeVolPolicyNotExists
ErrCodeDuplicateAccessKey
ErrCodeHaveNoPolicy
ErrCodeNoZoneToCreateMetaPartition
ErrCodeZoneNotExists
ErrCodeOwnVolExists
ErrCodeSuperAdminExists
ErrCodeInvalidUserID
ErrCodeInvalidUserType
ErrCodeNoPermission
ErrCodeTokenNotExist
ErrCodeInvalidAccessKey
ErrCodeInvalidSecretKey
ErrCodeIsOwner
ErrCodeZoneNumError
ErrCodeVersionOpError
ErrCodeNodeSetNotExists
)
// Err2CodeMap maps an error to its response code
var Err2CodeMap = map[error]int32{
ErrSuc: ErrCodeSuccess,
ErrInternalError: ErrCodeInternalError,
ErrParamError: ErrCodeParamError,
ErrInvalidCfg: ErrCodeInvalidCfg,
ErrPersistenceByRaft: ErrCodePersistenceByRaft,
ErrMarshalData: ErrCodeMarshalData,
ErrUnmarshalData: ErrCodeUnmarshalData,
ErrVolNotExists: ErrCodeVolNotExists,
ErrMetaPartitionNotExists: ErrCodeMetaPartitionNotExists,
ErrDataPartitionNotExists: ErrCodeDataPartitionNotExists,
ErrDataNodeNotExists: ErrCodeDataNodeNotExists,
ErrMetaNodeNotExists: ErrCodeMetaNodeNotExists,
ErrDuplicateVol: ErrCodeDuplicateVol,
ErrActiveDataNodesTooLess: ErrCodeActiveDataNodesTooLess,
ErrActiveMetaNodesTooLess: ErrCodeActiveMetaNodesTooLess,
ErrInvalidMpStart: ErrCodeInvalidMpStart,
ErrNoAvailDataPartition: ErrCodeNoAvailDataPartition,
ErrReshuffleArray: ErrCodeReshuffleArray,
ErrIllegalDataReplica: ErrCodeIllegalDataReplica,
ErrMissingReplica: ErrCodeMissingReplica,
ErrHasOneMissingReplica: ErrCodeHasOneMissingReplica,
ErrNoDataNodeToWrite: ErrCodeNoDataNodeToWrite,
ErrNoMetaNodeToWrite: ErrCodeNoMetaNodeToWrite,
ErrCannotBeOffLine: ErrCodeCannotBeOffLine,
ErrNoDataNodeToCreateDataPartition: ErrCodeNoDataNodeToCreateDataPartition,
ErrNoZoneToCreateDataPartition: ErrCodeNoZoneToCreateDataPartition,
ErrNoZoneToCreateMetaPartition: ErrCodeNoZoneToCreateMetaPartition,
ErrNoNodeSetToCreateDataPartition: ErrCodeNoNodeSetToCreateDataPartition,
ErrNoNodeSetToCreateMetaPartition: ErrCodeNoNodeSetToCreateMetaPartition,
ErrNoMetaNodeToCreateMetaPartition: ErrCodeNoMetaNodeToCreateMetaPartition,
ErrIllegalMetaReplica: ErrCodeIllegalMetaReplica,
ErrNoEnoughReplica: ErrCodeNoEnoughReplica,
ErrNoLeader: ErrCodeNoLeader,
ErrVolAuthKeyNotMatch: ErrCodeVolAuthKeyNotMatch,
ErrAuthKeyStoreError: ErrCodeAuthKeyStoreError,
ErrAuthAPIAccessGenRespError: ErrCodeAuthAPIAccessGenRespError,
ErrAuthOSCapsOpGenRespError: ErrCodeAuthOSCapsOpGenRespError,
ErrAccessKeyNotExists: ErrCodeAccessKeyNotExists,
ErrInvalidTicket: ErrCodeInvalidTicket,
ErrInvalidClientIDKey: ErrCodeInvalidClientIDKey,
ErrExpiredTicket: ErrCodeExpiredTicket,
ErrMasterAPIGenRespError: ErrCodeMasterAPIGenRespError,
ErrDuplicateUserID: ErrCodeDuplicateUserID,
ErrUserNotExists: ErrCodeUserNotExists,
ErrReadBodyError: ErrCodeReadBodyError,
ErrVolPolicyNotExists: ErrCodeVolPolicyNotExists,
ErrDuplicateAccessKey: ErrCodeDuplicateAccessKey,
ErrHaveNoPolicy: ErrCodeHaveNoPolicy,
ErrZoneNotExists: ErrCodeZoneNotExists,
ErrOwnVolExists: ErrCodeOwnVolExists,
ErrSuperAdminExists: ErrCodeSuperAdminExists,
ErrInvalidUserID: ErrCodeInvalidUserID,
ErrInvalidUserType: ErrCodeInvalidUserType,
ErrNoPermission: ErrCodeNoPermission,
ErrTokenNotFound: ErrCodeTokenNotExist,
ErrInvalidAccessKey: ErrCodeInvalidAccessKey,
ErrInvalidSecretKey: ErrCodeInvalidSecretKey,
ErrIsOwner: ErrCodeIsOwner,
ErrZoneNum: ErrCodeZoneNumError,
ErrCodeVersionOp: ErrCodeVersionOpError,
ErrNodeSetNotExists: ErrCodeNodeSetNotExists,
}
func ParseErrorCode(code int32) error {
if err, exist := code2ErrMap[code]; exist {
return err
}
return ErrInternalError
}
// code2ErrMap maps a response code back to its error
var code2ErrMap = map[int32]error{
ErrCodeSuccess: ErrSuc,
ErrCodeInternalError: ErrInternalError,
ErrCodeParamError: ErrParamError,
ErrCodeInvalidCfg: ErrInvalidCfg,
ErrCodePersistenceByRaft: ErrPersistenceByRaft,
ErrCodeMarshalData: ErrMarshalData,
ErrCodeUnmarshalData: ErrUnmarshalData,
ErrCodeVolNotExists: ErrVolNotExists,
ErrCodeMetaPartitionNotExists: ErrMetaPartitionNotExists,
ErrCodeDataPartitionNotExists: ErrDataPartitionNotExists,
ErrCodeDataNodeNotExists: ErrDataNodeNotExists,
ErrCodeMetaNodeNotExists: ErrMetaNodeNotExists,
ErrCodeDuplicateVol: ErrDuplicateVol,
ErrCodeActiveDataNodesTooLess: ErrActiveDataNodesTooLess,
ErrCodeActiveMetaNodesTooLess: ErrActiveMetaNodesTooLess,
ErrCodeInvalidMpStart: ErrInvalidMpStart,
ErrCodeNoAvailDataPartition: ErrNoAvailDataPartition,
ErrCodeReshuffleArray: ErrReshuffleArray,
ErrCodeIllegalDataReplica: ErrIllegalDataReplica,
ErrCodeMissingReplica: ErrMissingReplica,
ErrCodeHasOneMissingReplica: ErrHasOneMissingReplica,
ErrCodeNoDataNodeToWrite: ErrNoDataNodeToWrite,
ErrCodeNoMetaNodeToWrite: ErrNoMetaNodeToWrite,
ErrCodeCannotBeOffLine: ErrCannotBeOffLine,
ErrCodeNoDataNodeToCreateDataPartition: ErrNoDataNodeToCreateDataPartition,
ErrCodeNoZoneToCreateDataPartition: ErrNoZoneToCreateDataPartition,
ErrCodeNoZoneToCreateMetaPartition: ErrNoZoneToCreateMetaPartition,
ErrCodeNoNodeSetToCreateDataPartition: ErrNoNodeSetToCreateDataPartition,
ErrCodeNoNodeSetToCreateMetaPartition: ErrNoNodeSetToCreateMetaPartition,
ErrCodeNoMetaNodeToCreateMetaPartition: ErrNoMetaNodeToCreateMetaPartition,
ErrCodeIllegalMetaReplica: ErrIllegalMetaReplica,
ErrCodeNoEnoughReplica: ErrNoEnoughReplica,
ErrCodeNoLeader: ErrNoLeader,
ErrCodeVolAuthKeyNotMatch: ErrVolAuthKeyNotMatch,
ErrCodeAuthKeyStoreError: ErrAuthKeyStoreError,
ErrCodeAuthAPIAccessGenRespError: ErrAuthAPIAccessGenRespError,
ErrCodeAuthOSCapsOpGenRespError: ErrAuthOSCapsOpGenRespError,
ErrCodeAccessKeyNotExists: ErrAccessKeyNotExists,
ErrCodeInvalidTicket: ErrInvalidTicket,
ErrCodeInvalidClientIDKey: ErrInvalidClientIDKey,
ErrCodeExpiredTicket: ErrExpiredTicket,
ErrCodeMasterAPIGenRespError: ErrMasterAPIGenRespError,
ErrCodeDuplicateUserID: ErrDuplicateUserID,
ErrCodeUserNotExists: ErrUserNotExists,
ErrCodeReadBodyError: ErrReadBodyError,
ErrCodeVolPolicyNotExists: ErrVolPolicyNotExists,
ErrCodeDuplicateAccessKey: ErrDuplicateAccessKey,
ErrCodeHaveNoPolicy: ErrHaveNoPolicy,
ErrCodeZoneNotExists: ErrZoneNotExists,
ErrCodeOwnVolExists: ErrOwnVolExists,
ErrCodeSuperAdminExists: ErrSuperAdminExists,
ErrCodeInvalidUserType: ErrInvalidUserType,
ErrCodeInvalidUserID: ErrInvalidUserID,
ErrCodeNoPermission: ErrNoPermission,
ErrCodeTokenNotExist: ErrTokenNotFound,
ErrCodeInvalidAccessKey: ErrInvalidAccessKey,
ErrCodeInvalidSecretKey: ErrInvalidSecretKey,
ErrCodeIsOwner: ErrIsOwner,
ErrCodeZoneNumError: ErrZoneNum,
ErrCodeVersionOpError: ErrCodeVersionOp,
ErrCodeNodeSetNotExists: ErrNodeSetNotExists,
}
type GeneralResp struct {
Message string
Code int32
}
func Success(msg string) *GeneralResp {
return &GeneralResp{Message: msg, Code: ErrCodeSuccess}
}
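// Illustrative sketch, not part of the original API: errors cross the wire as
// int32 codes; Err2CodeMap and ParseErrorCode round-trip a known error, and an
// unknown code falls back to ErrInternalError.
func exampleErrCodeRoundTrip() bool {
	code := Err2CodeMap[ErrVolNotExists] // ErrCodeVolNotExists
	return ParseErrorCode(code) == ErrVolNotExists && ParseErrorCode(-1) == ErrInternalError
}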
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"math"
"sync"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
var (
ExtentKeyHeader = []byte("EKV2")
ExtentKeyHeaderV3 = []byte("EKV3")
ExtentKeyHeaderSize = len(ExtentKeyHeader)
ExtentLength = 40
ExtentKeyChecksumSize = 4
ExtentVerFieldSize = 9 // ver(8) and isSplit(1)
ExtentV2Length = ExtentKeyHeaderSize + ExtentLength + ExtentKeyChecksumSize
ExtentV3Length = ExtentKeyHeaderSize + ExtentLength + ExtentKeyChecksumSize + ExtentVerFieldSize
InvalidKey = errors.New("invalid key error")
InvalidKeyHeader = errors.New("invalid extent v2 key header error")
InvalidKeyCheckSum = errors.New("invalid extent v2 key checksum error")
)
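// Layout note (added for clarity): the fixed 40-byte extent key body is
// FileOffset(8) + PartitionId(8) + ExtentId(8) + ExtentOffset(8) + Size(4) + CRC(4).
// V2 framing adds the 4-byte magic header plus a 4-byte CRC32 checksum, and V3
// further appends VerSeq(8) and the isSplit flag(1), matching ExtentV2Length
// and ExtentV3Length above.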
type ExtSnapInfo struct {
VerSeq uint64
IsSplit bool
ModGen uint64
}
// ExtentKey defines the extent key struct.
type ExtentKey struct {
FileOffset uint64 // offset in file
PartitionId uint64
ExtentId uint64
ExtentOffset uint64 // offset within the extent; tiny extents may start at an offset larger than 0, normal extents start at 0
Size uint32 // size the inode actually uses on this extent; it may be only part of the extent's real size (e.g. for tiny extents)
CRC uint32
// snapshot
SnapInfo *ExtSnapInfo
}
func (k *ExtentKey) GetModGen() uint64 {
if k.SnapInfo == nil {
return 0
}
return k.SnapInfo.ModGen
}
func (k *ExtentKey) AddModGen() {
if k.SnapInfo == nil {
k.SnapInfo = &ExtSnapInfo{
ModGen: 1,
}
return
}
k.SnapInfo.ModGen++
}
func (k *ExtentKey) Equals(ek *ExtentKey) bool {
if k == nil && ek == nil {
return true
} else if k == nil || ek == nil {
return false
}
if k.PartitionId != ek.PartitionId ||
k.Size != ek.Size ||
k.ExtentOffset != ek.ExtentOffset ||
k.FileOffset != ek.FileOffset ||
k.ExtentId != ek.ExtentId ||
k.CRC != ek.CRC {
return false
}
if k.SnapInfo == nil && ek.SnapInfo == nil {
return true
} else if k.SnapInfo == nil || ek.SnapInfo == nil {
return false
}
return k.SnapInfo.IsSplit == ek.SnapInfo.IsSplit &&
k.SnapInfo.VerSeq == ek.SnapInfo.VerSeq
}
func (k *ExtentKey) IsSplit() bool {
if k.SnapInfo == nil {
return false
}
return k.SnapInfo.IsSplit
}
func (k *ExtentKey) GetSeq() uint64 {
if k.SnapInfo == nil {
return 0
}
return k.SnapInfo.VerSeq
}
func (k *ExtentKey) SetSeq(seq uint64) {
if seq == 0 && k.SnapInfo == nil {
return
}
if k.SnapInfo == nil {
k.SnapInfo = &ExtSnapInfo{VerSeq: seq}
return
}
k.SnapInfo.VerSeq = seq
}
func (k *ExtentKey) SetSplit(split bool) {
if !split && k.SnapInfo == nil {
return
}
if k.SnapInfo == nil {
k.SnapInfo = &ExtSnapInfo{
IsSplit: split,
}
return
}
k.SnapInfo.IsSplit = split
}
func (k *ExtentKey) GenerateId() uint64 {
if k.PartitionId > math.MaxUint32 || k.ExtentId > math.MaxUint32 {
log.LogFatalf("ext %v abnormal", k)
}
return (k.PartitionId << 32) | k.ExtentId
}
func ParseFromId(sID uint64) (dpID uint64, extID uint64) {
dpID = sID >> 32
extID = sID & math.MaxUint32
return
}
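// Illustrative sketch, not part of the original API: GenerateId packs the
// partition ID into the high 32 bits and the extent ID into the low 32 bits,
// so ParseFromId recovers both as long as each ID fits in 32 bits.
func exampleExtentIdRoundTrip(k *ExtentKey) bool {
	dpID, extID := ParseFromId(k.GenerateId())
	return dpID == k.PartitionId && extID == k.ExtentId
}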
func MergeSplitKey(inodeID uint64, ekRefMap *sync.Map, sMap *sync.Map) (err error) {
if ekRefMap == nil || sMap == nil {
log.LogErrorf("MergeSplitKey. inodeID %v should not use nil role", inodeID)
return
}
sMap.Range(func(id, value interface{}) bool {
dpID, extID := ParseFromId(id.(uint64))
nVal := uint32(0)
val, ok := ekRefMap.Load(id)
if ok {
nVal = val.(uint32)
}
log.LogDebugf("UnmarshalInodeValue inode %v get splitID %v dp id %v extentid %v, refCnt %v, add %v",
inodeID, id.(uint64), dpID, extID, value.(uint32), nVal)
ekRefMap.Store(id, nVal+value.(uint32))
return true
})
return
}
func (k *ExtentKey) IsEqual(rightKey *ExtentKey) bool {
// return false
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
k.GetSeq() == rightKey.GetSeq() &&
k.ExtentOffset == rightKey.ExtentOffset &&
k.FileOffset == rightKey.FileOffset
}
func (k *ExtentKey) IsCoveredWithDiffSeq(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
k.GetSeq() < rightKey.GetSeq() &&
k.ExtentOffset+uint64(k.Size) == rightKey.ExtentOffset &&
k.FileOffset+uint64(k.Size) == rightKey.FileOffset
}
func (k *ExtentKey) IsSequenceWithSameSeq(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
k.GetSeq() == rightKey.GetSeq() &&
k.ExtentOffset+uint64(k.Size) == rightKey.ExtentOffset &&
k.FileOffset+uint64(k.Size) == rightKey.FileOffset
}
func (k *ExtentKey) IsSameExtent(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId
}
func (k *ExtentKey) IsSequenceWithDiffSeq(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
!(k.GetSeq() == rightKey.GetSeq()) &&
k.ExtentOffset+uint64(k.Size) == rightKey.ExtentOffset &&
k.FileOffset+uint64(k.Size) == rightKey.FileOffset
}
func (k *ExtentKey) IsFileInSequence(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
k.ExtentOffset+uint64(k.Size) == rightKey.ExtentOffset
}
// String returns the string format of the extentKey.
func (k ExtentKey) String() string {
return fmt.Sprintf("ExtentKey{FileOffset(%v),VerSeq(%v) Partition(%v),ExtentID(%v),ExtentOffset(%v),isSplit(%v),Size(%v),CRC(%v)}",
k.FileOffset, k.GetSeq(), k.PartitionId, k.ExtentId, k.ExtentOffset, k.IsSplit(), k.Size, k.CRC)
}
// Less defines the less comparator.
func (k *ExtentKey) Less(than btree.Item) bool {
that := than.(*ExtentKey)
return k.FileOffset < that.FileOffset
}
// Copy returns the extent key itself to implement the btree.Item interface.
func (k *ExtentKey) Copy() btree.Item {
return k
}
func (k *ExtentKey) Marshal() (m string) {
return fmt.Sprintf("%v_%v_%v_%v_%v_%v", k.FileOffset, k.PartitionId, k.ExtentId, k.ExtentOffset, k.Size, k.CRC)
}
func (k *ExtentKey) MarshalBinaryExt(data []byte) {
binary.BigEndian.PutUint64(data[0:], k.FileOffset)
binary.BigEndian.PutUint64(data[8:], k.PartitionId)
binary.BigEndian.PutUint64(data[16:], k.ExtentId)
binary.BigEndian.PutUint64(data[24:], k.ExtentOffset)
binary.BigEndian.PutUint32(data[32:], k.Size)
binary.BigEndian.PutUint32(data[36:], k.CRC)
}
// MarshalBinary marshals the binary format of the extent key.
func (k *ExtentKey) MarshalBinary(v3 bool) ([]byte, error) {
buf := bytes.NewBuffer(make([]byte, 0, ExtentLength))
if err := binary.Write(buf, binary.BigEndian, k.FileOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.PartitionId); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.ExtentId); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.ExtentOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Size); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.CRC); err != nil {
return nil, err
}
if v3 {
if err := binary.Write(buf, binary.BigEndian, k.GetSeq()); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.IsSplit()); err != nil {
return nil, err
}
}
return buf.Bytes(), nil
}
// UnmarshalBinary unmarshals the binary format of the extent key.
func (k *ExtentKey) UnmarshalBinary(buf *bytes.Buffer, v3 bool) (err error) {
if err = binary.Read(buf, binary.BigEndian, &k.FileOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.PartitionId); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.ExtentId); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.ExtentOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Size); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.CRC); err != nil {
return
}
if v3 {
var seq uint64
if err = binary.Read(buf, binary.BigEndian, &seq); err != nil {
return
}
k.SetSeq(seq)
var isSplit bool
if err = binary.Read(buf, binary.BigEndian, &isSplit); err != nil {
return
}
k.SetSplit(isSplit)
}
return
}
func (k *ExtentKey) CheckSum(v3 bool) uint32 {
sign := crc32.NewIEEE()
buf, err := k.MarshalBinary(v3)
if err != nil {
log.LogErrorf("[ExtentKey] extentKey %v CRC32 error: %v", k, err)
return 0
}
sign.Write(buf)
return sign.Sum32()
}
// MarshalBinaryWithCheckSum marshals the extent key with the v2/v3 magic header and a trailing CRC32 checksum.
func (k *ExtentKey) MarshalBinaryWithCheckSum(v3 bool) ([]byte, error) {
extLen := ExtentV2Length
flag := ExtentKeyHeader
if v3 {
extLen = ExtentV3Length
flag = ExtentKeyHeaderV3
}
buf := bytes.NewBuffer(make([]byte, 0, extLen))
if err := binary.Write(buf, binary.BigEndian, flag); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.FileOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.PartitionId); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.ExtentId); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.ExtentOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Size); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.CRC); err != nil {
return nil, err
}
if v3 {
if err := binary.Write(buf, binary.BigEndian, k.GetSeq()); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.IsSplit()); err != nil {
return nil, err
}
}
if err := binary.Write(buf, binary.BigEndian, k.CheckSum(v3)); err != nil {
return nil, err
}
return buf.Bytes(), nil
}
// UnmarshalBinaryWithCheckSum unmarshals the extent key from the buffer and verifies its checksum.
func (k *ExtentKey) UnmarshalBinaryWithCheckSum(buf *bytes.Buffer) (err error) {
var (
checksum uint32
v3 bool
)
magic := make([]byte, ExtentKeyHeaderSize)
if err = binary.Read(buf, binary.BigEndian, magic); err != nil {
return
}
if r := bytes.Compare(magic, ExtentKeyHeader); r != 0 {
if r = bytes.Compare(magic, ExtentKeyHeaderV3); r != 0 {
log.LogErrorf("action[UnmarshalBinaryWithCheckSum] err header magic %v", string(magic))
err = InvalidKeyHeader
return
}
v3 = true
}
if err = binary.Read(buf, binary.BigEndian, &k.FileOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.PartitionId); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.ExtentId); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.ExtentOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Size); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.CRC); err != nil {
return
}
if v3 {
var seq uint64
if err = binary.Read(buf, binary.BigEndian, &seq); err != nil {
return
}
k.SetSeq(seq)
var split bool
if err = binary.Read(buf, binary.BigEndian, &split); err != nil {
return
}
k.SetSplit(split)
}
if err = binary.Read(buf, binary.BigEndian, &checksum); err != nil {
return
}
if k.CheckSum(v3) != checksum {
err = InvalidKeyCheckSum
return
}
return
}
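// Illustrative sketch, not part of the original API: a V3 marshal/unmarshal
// round trip; UnmarshalBinaryWithCheckSum rejects the buffer if the trailing
// checksum does not match the recomputed one.
func exampleExtentKeyCodec(k *ExtentKey) (ExtentKey, error) {
	data, err := k.MarshalBinaryWithCheckSum(true)
	if err != nil {
		return ExtentKey{}, err
	}
	var out ExtentKey
	if err := out.UnmarshalBinaryWithCheckSum(bytes.NewBuffer(data)); err != nil {
		return ExtentKey{}, err
	}
	return out, nil
}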
// TODO remove
func (k *ExtentKey) UnMarshal(m string) (err error) {
_, err = fmt.Sscanf(m, "%v_%v_%v_%v_%v_%v", &k.FileOffset, &k.PartitionId, &k.ExtentId, &k.ExtentOffset, &k.Size, &k.CRC)
return
}
// TODO remove
func (k *ExtentKey) GetExtentKey() (m string) {
return fmt.Sprintf("%v_%v_%v_%v_%v", k.PartitionId, k.FileOffset, k.ExtentId, k.ExtentOffset, k.Size)
}
type TinyExtentDeleteRecord struct {
FileOffset uint64
PartitionId uint64
ExtentId uint64
ExtentOffset uint64
Size uint32
CRC uint32
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
)
const (
RootIno = uint64(1)
SummaryKey = "cbfs.dir.summary"
QuotaKey = "qa"
)
const (
FlagsSyncWrite int = 1 << iota
FlagsAppend
FlagsCache
)
const (
FlagsSnapshotDel int = 1 << iota
FlagsSnapshotDelDir
FlagsVerAll
)
// Mode returns the fileMode.
func Mode(osMode os.FileMode) uint32 {
return uint32(osMode)
}
// OsMode returns os.FileMode.
func OsMode(mode uint32) os.FileMode {
return os.FileMode(mode)
}
// OsModeType returns os.FileMode masked by os.ModeType.
func OsModeType(mode uint32) os.FileMode {
return os.FileMode(mode) & os.ModeType
}
// IsRegular checks if the mode is regular.
func IsRegular(mode uint32) bool {
return OsMode(mode).IsRegular()
}
// IsDir checks if the mode is dir.
func IsDir(mode uint32) bool {
return OsMode(mode).IsDir()
}
// IsSymlink checks if the mode is symlink.
func IsSymlink(mode uint32) bool {
return OsMode(mode)&os.ModeSymlink != 0
}
func IsAncestor(parent, child string) bool {
rel, err := filepath.Rel(parent, child)
if err != nil {
return false
}
return !strings.HasPrefix(rel, "..")
}
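// Added note: IsAncestor is a purely lexical check via filepath.Rel, e.g.
// IsAncestor("/vol/a", "/vol/a/b/c") is true, IsAncestor("/vol/a", "/vol/ab")
// is false, and a path counts as an ancestor of itself.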
// InodeInfo defines the inode struct.
type InodeInfo struct {
Inode uint64 `json:"ino"`
Mode uint32 `json:"mode"`
Nlink uint32 `json:"nlink"`
Size uint64 `json:"sz"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
Generation uint64 `json:"gen"`
ModifyTime time.Time `json:"mt"`
CreateTime time.Time `json:"ct"`
AccessTime time.Time `json:"at"`
Target []byte `json:"tgt"`
QuotaInfos map[uint32]*MetaQuotaInfo `json:"qifs"`
VerSeq uint64 `json:"seq"`
expiration int64
}
type SimpleExtInfo struct {
ID uint64
PartitionID uint32
ExtentID uint32
}
// InodeSplitInfo defines the inode split info struct.
type InodeSplitInfo struct {
Inode uint64 `json:"ino"`
SplitArr []SimpleExtInfo `json:"splitInfo"`
VerSeq uint64 `json:"seq"`
}
type SummaryInfo struct {
Files int64 `json:"files"`
Subdirs int64 `json:"subdirs"`
Fbytes int64 `json:"fbytes"`
}
type DentryInfo struct {
Name string `json:"name"`
Inode uint64 `json:"inode"`
expiration int64
}
func (info *DentryInfo) SetExpiration(e int64) {
info.expiration = e
}
func (info *DentryInfo) Expiration() int64 {
return info.expiration
}
func (info *InodeInfo) Expiration() int64 {
return info.expiration
}
func (info *InodeInfo) SetExpiration(e int64) {
info.expiration = e
}
// String returns the string format of the inode.
func (info *InodeInfo) String() string {
return fmt.Sprintf("Inode(%v) Mode(%v) OsMode(%v) Nlink(%v) Size(%v) Uid(%v) Gid(%v) Gen(%v) QuotaIds(%v)",
info.Inode, info.Mode, OsMode(info.Mode), info.Nlink, info.Size, info.Uid, info.Gid, info.Generation, info.QuotaInfos)
}
type XAttrInfo struct {
Inode uint64
XAttrs map[string]string
}
func (info XAttrInfo) Get(key string) []byte {
return []byte(info.XAttrs[key])
}
func (info XAttrInfo) VisitAll(visitor func(key string, value []byte) bool) {
for k, v := range info.XAttrs {
if visitor == nil || !visitor(k, []byte(v)) {
return
}
}
}
func (info XAttrInfo) String() string {
builder := strings.Builder{}
for k, v := range info.XAttrs {
if builder.Len() != 0 {
builder.WriteString(",")
}
builder.WriteString(fmt.Sprintf("%s:%s", k, v))
}
return fmt.Sprintf("XAttrInfo{Inode(%v), XAttrs(%v)}", info.Inode, builder.String())
}
// Dentry defines the dentry struct.
type Dentry struct {
Name string `json:"name"`
Inode uint64 `json:"ino"`
Type uint32 `json:"type"`
}
// String returns the string format of the dentry.
func (d Dentry) String() string {
return fmt.Sprintf("Dentry{Name(%v),Inode(%v),Type(%v)}", d.Name, d.Inode, d.Type)
}
type RequestExtend struct {
FullPaths []string `json:"fullPaths"`
}
// NOTE: a batch request may carry multiple full paths, while other requests carry only one.
func (r *RequestExtend) GetFullPath() string {
if len(r.FullPaths) < 1 {
return ""
}
return r.FullPaths[0]
}
// QuotaCreateInodeRequest defines the request to create an inode with quota IDs.
type QuotaCreateInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Mode uint32 `json:"mode"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
Target []byte `json:"tgt"`
QuotaIds []uint32 `json:"qids"`
RequestExtend
}
type CreateInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Mode uint32 `json:"mode"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
Target []byte `json:"tgt"`
RequestExtend
}
// CreateInodeResponse defines the response to the request of creating an inode.
type CreateInodeResponse struct {
Info *InodeInfo `json:"info"`
}
type TxCreateRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
*TransactionInfo `json:"tx"`
}
type TxCreateResponse struct {
TxInfo *TransactionInfo `json:"tx"`
}
type TxApplyRMRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
*TransactionInfo `json:"tx"`
}
// TxCreateInodeRequest defines the request to create an inode with transaction info.
type TxCreateInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Mode uint32 `json:"mode"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
Target []byte `json:"tgt"`
QuotaIds []uint32 `json:"qids"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
// TxCreateInodeResponse defines the response with transaction info to the request of creating an inode.
type TxCreateInodeResponse struct {
Info *InodeInfo `json:"info"`
TxInfo *TransactionInfo `json:"tx"`
}
const (
TxCommit int = 1 << iota
TxRollback
)
type TxApplyRequest struct {
TxID string `json:"tx"`
TmID uint64 `json:"tmid"`
TxApplyType int `json:"type"`
}
type TxSetStateRequest struct {
TxID string `json:"tx"`
State int32 `json:"state"`
}
type TxInodeApplyRequest struct {
TxID string `json:"txid"`
Inode uint64 `json:"ino"`
TxApplyType int `json:"type"`
ApplyFrom uint32 `json:"from"`
}
type TxDentryApplyRequest struct {
TxID string `json:"txid"`
// DenKey string `json:"denkey"`
Pid uint64 `json:"pid"`
Name string `json:"name"`
TxApplyType int `json:"type"`
ApplyFrom uint32 `json:"from"`
}
type TxGetInfoRequest struct {
VolName string `json:"vol"`
TxID string `json:"txid"`
Pid uint64 `json:"pid"`
}
type TxGetInfoResponse struct {
TxInfo *TransactionInfo `json:"tx"`
}
// LinkInodeRequest defines the request to link an inode.
type LinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
UniqID uint64 `json:"uiq"`
IsRename bool `json:"rename"`
RequestExtend
}
// LinkInodeResponse defines the response to the request of linking an inode.
type LinkInodeResponse struct {
Info *InodeInfo `json:"info"`
}
type TxLinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxLinkInodeRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxLinkInodeResponse struct {
Info *InodeInfo `json:"info"`
}
type ClearInodeCacheRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
}
type ClearInodeCacheResponse struct {
Info *InodeInfo `json:"info"`
}
type TxUnlinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Evict bool `json:"evict"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxUnlinkInodeRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxUnlinkInodeResponse struct {
Info *InodeInfo `json:"info"`
TxInfo *TransactionInfo `json:"tx"`
}
// UnlinkInodeRequest defines the request to unlink an inode.
type UnlinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
UniqID uint64 `json:"uid"` // for request dedup
VerSeq uint64 `json:"ver"`
DenVerSeq uint64 `json:"denVer"`
RequestExtend
}
// BatchUnlinkInodeRequest defines the request to unlink inodes in batch.
type BatchUnlinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inodes []uint64 `json:"inos"`
FullPaths []string `json:"fullPaths"`
}
// UnlinkInodeResponse defines the response to the request of unlinking an inode.
type UnlinkInodeResponse struct {
Info *InodeInfo `json:"info"`
}
// BatchUnlinkInodeResponse defines the response to the request of unlinking inodes in batch.
type BatchUnlinkInodeResponse struct {
Items []*struct {
Info *InodeInfo `json:"info"`
Status uint8 `json:"status"`
} `json:"items"`
}
// EvictInodeRequest defines the request to evict an inode.
type EvictInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
RequestExtend
}
// BatchEvictInodeRequest defines the request to evict inodes in batch.
type BatchEvictInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inodes []uint64 `json:"inos"`
FullPaths []string `json:"fullPaths"`
}
// QuotaCreateDentryRequest defines the request to create a dentry with quota IDs.
type QuotaCreateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Inode uint64 `json:"ino"`
Name string `json:"name"`
Mode uint32 `json:"mode"`
QuotaIds []uint32 `json:"qids"`
VerSeq uint64 `json:"seq"`
RequestExtend
}
type CreateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Inode uint64 `json:"ino"`
Name string `json:"name"`
Mode uint32 `json:"mode"`
RequestExtend
}
type TxPack interface {
GetInfo() string
}
// TxCreateDentryRequest defines the request to create a dentry.
type TxCreateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Inode uint64 `json:"ino"`
Name string `json:"name"`
Mode uint32 `json:"mode"`
QuotaIds []uint32 `json:"qids"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxCreateDentryRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxCreateDentryResponse struct {
TxInfo *TransactionInfo `json:"tx"`
}
// UpdateDentryRequest defines the request to update a dentry.
type UpdateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
Inode uint64 `json:"ino"` // new inode number
RequestExtend
}
// UpdateDentryResponse defines the response to the request of updating a dentry.
type UpdateDentryResponse struct {
Inode uint64 `json:"ino"` // old inode number
}
type TxUpdateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
Inode uint64 `json:"ino"` // new inode number
OldIno uint64 `json:"oldIno"` // old inode number
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxUpdateDentryRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxUpdateDentryResponse struct {
Inode uint64 `json:"ino"` // old inode number
}
type TxDeleteDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
Ino uint64 `json:"ino"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxDeleteDentryRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxDeleteDentryResponse struct {
Inode uint64 `json:"ino"`
}
// DeleteDentryRequest defines the request to delete a dentry.
type DeleteDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
InodeCreateTime int64 `json:"inodeCreateTime"`
Verseq uint64 `json:"ver"`
RequestExtend
}
type BatchDeleteDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Dens []Dentry `json:"dens"`
FullPaths []string `json:"fullPaths"`
}
// DeleteDentryResponse defines the response to the request of deleting a dentry.
type DeleteDentryResponse struct {
Inode uint64 `json:"ino"`
}
// BatchDeleteDentryResponse defines the response to the request of deleting dentries in batch.
type BatchDeleteDentryResponse struct {
ParentID uint64 `json:"pino"`
Items []*struct {
Inode uint64 `json:"ino"`
Status uint8 `json:"status"`
} `json:"items"`
}
// LookupRequest defines the request for lookup.
type LookupRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
VerSeq uint64 `json:"seq"`
VerAll bool `json:"verAll"`
}
type DetryInfo struct {
Inode uint64 `json:"ino"`
Mode uint32 `json:"mode"`
VerSeq uint64 `json:"seq"`
IsDel bool `json:"isDel"`
}
// LookupResponse defines the response for the lookup request.
type LookupResponse struct {
Inode uint64 `json:"ino"`
Mode uint32 `json:"mode"`
VerSeq uint64 `json:"seq"`
LayAll []DetryInfo `json:"layerInfo"`
}
// InodeGetRequest defines the request to get the inode.
type InodeGetRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
VerAll bool `json:"verAll"`
}
type LayerInfo struct {
LayerIdx uint32 `json:"layerIdx"`
Info *InodeInfo `json:"info"`
Eks []ExtentKey `json:"eks"`
}
// InodeGetResponse defines the response to the InodeGetRequest.
type InodeGetResponse struct {
Info *InodeInfo `json:"info"`
LayAll []InodeInfo `json:"layerInfo"`
}
// BatchInodeGetRequest defines the request to get the inode in batch.
type BatchInodeGetRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inodes []uint64 `json:"inos"`
VerSeq uint64 `json:"seq"`
}
// BatchInodeGetResponse defines the response to the request of getting the inode in batch.
type BatchInodeGetResponse struct {
Infos []*InodeInfo `json:"infos"`
}
// InodeGetSplitRequest defines the request to get the split extent info of an inode.
type InodeGetSplitRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
VerAll bool `json:"verAll"`
}
// InodeGetSplitResponse defines the response to the InodeGetSplitRequest.
type InodeGetSplitResponse struct {
Info *InodeSplitInfo `json:"info"`
LayAll []InodeSplitInfo `json:"layerInfo"`
}
// ReadDirRequest defines the request to read dir.
type ReadDirRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
VerSeq uint64 `json:"seq"`
}
type ReadDirOnlyRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
VerSeq uint64 `json:"seq"`
}
// ReadDirResponse defines the response to the request of reading dir.
type ReadDirResponse struct {
Children []Dentry `json:"children"`
}
type ReadDirOnlyResponse struct {
Children []Dentry `json:"children"`
}
// ReadDirLimitRequest defines the request to read dir with limited dentries.
type ReadDirLimitRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Marker string `json:"marker"`
Limit uint64 `json:"limit"`
VerSeq uint64 `json:"seq"`
VerOpt uint8 `json:"VerOpt"`
}
type ReadDirLimitResponse struct {
Children []Dentry `json:"children"`
}
// AppendExtentKeyRequest defines the request to append an extent key.
type AppendExtentKeyRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extent ExtentKey `json:"ek"`
}
type AppendExtentKeyWithCheckRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extent ExtentKey `json:"ek"`
DiscardExtents []ExtentKey `json:"dek"`
VerSeq uint64 `json:"seq"`
IsSplit bool
}
// AppendObjExtentKeysRequest defines the request to append obj extent keys.
type AppendObjExtentKeysRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extents []ObjExtentKey `json:"ek"`
}
// GetExtentsRequest defines the request to get extents.
type GetExtentsRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
VerAll bool
}
// GetObjExtentsResponse defines the response to the request of getting obj extents.
type GetObjExtentsResponse struct {
Generation uint64 `json:"gen"`
Size uint64 `json:"sz"`
Extents []ExtentKey `json:"eks"`
ObjExtents []ObjExtentKey `json:"objeks"`
}
// GetExtentsResponse defines the response to the request of getting extents.
type GetExtentsResponse struct {
Generation uint64 `json:"gen"`
Size uint64 `json:"sz"`
Extents []ExtentKey `json:"eks"`
LayerInfo []LayerInfo `json:"layer"`
Status int
}
// TruncateRequest defines the request to truncate.
type TruncateRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Size uint64 `json:"sz"`
RequestExtend
}
type EmptyExtentKeyRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
}
type DelVerRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"ver"`
}
type DelExtentKeyRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extents []ExtentKey `json:"ek"`
}
// SetAttrRequest defines the request to set attribute.
type SetAttrRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Mode uint32 `json:"mode"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
ModifyTime int64 `json:"mt"`
AccessTime int64 `json:"at"`
Valid uint32 `json:"valid"`
VerSeq uint64 `json:"seq"`
}
const (
AttrMode uint32 = 1 << iota
AttrUid
AttrGid
AttrModifyTime
AttrAccessTime
)
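// Usage sketch (editor's illustration): the Valid field of SetAttrRequest is
// treated here as a bitmask of the Attr* flags above, marking which fields
// the meta node should apply. A chmod+chown style request might look like:
//
//	req := &SetAttrRequest{
//		Inode: 100,
//		Mode:  0o644,
//		Uid:   1000,
//		Gid:   1000,
//		Valid: AttrMode | AttrUid | AttrGid,
//	}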
// DeleteInodeRequest defines the request to delete an inode.
type DeleteInodeRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
RequestExtend
}
// DeleteInodeBatchRequest defines the request to delete inodes in batch.
type DeleteInodeBatchRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inodes []uint64 `json:"ino"`
FullPaths []string `json:"fullPaths"`
}
// AppendExtentKeysRequest defines the request to append extent keys.
type AppendExtentKeysRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extents []ExtentKey `json:"eks"`
}
type SetXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
Value string `json:"val"`
}
type BatchSetXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Attrs map[string]string `json:"attrs"`
}
type GetAllXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
}
type GetAllXAttrResponse struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Attrs map[string]string `json:"attrs"`
}
type GetXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
VerSeq uint64 `json:"seq"`
}
type GetXAttrResponse struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
Value string `json:"val"`
}
type RemoveXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
VerSeq uint64 `json:"seq"`
}
type ListXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
}
type ListXAttrResponse struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
XAttrs []string `json:"xattrs"`
}
type BatchGetXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inodes []uint64 `json:"inos"`
Keys []string `json:"keys"`
VerSeq uint64 `json:"seq"`
}
type BatchGetXAttrResponse struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
XAttrs []*XAttrInfo
}
type UpdateXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
Value string `json:"val"`
}
type MultipartInfo struct {
ID string `json:"id"`
Path string `json:"path"`
InitTime time.Time `json:"itime"`
Parts []*MultipartPartInfo `json:"parts"`
Extend map[string]string `json:"extend"`
}
type MultipartPartInfo struct {
ID uint16 `json:"id"`
Inode uint64 `json:"ino"`
MD5 string `json:"md5"`
Size uint64 `json:"sz"`
UploadTime time.Time `json:"ut"`
}
type CreateMultipartRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Path string `json:"path"`
Extend map[string]string `json:"extend"`
}
type CreateMultipartResponse struct {
Info *MultipartInfo `json:"info"`
}
type GetMultipartRequest struct {
VolName string `json:"vol"`
Path string `json:"path"`
PartitionId uint64 `json:"pid"`
MultipartId string `json:"mid"`
}
type GetMultipartResponse struct {
Info *MultipartInfo `json:"info"`
}
type GetExpiredMultipartRequest struct {
VolName string `json:"vol"`
Prefix string `json:"path"`
Days int `json:"days"`
PartitionId uint64 `json:"pid"`
}
type ExpiredMultipartInfo struct {
Path string `json:"path"`
MultipartId string `json:"mid"`
Inodes []uint64 `json:"inodes"`
}
type GetExpiredMultipartResponse struct {
Infos []*ExpiredMultipartInfo `json:"infos"`
}
type AddMultipartPartRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Path string `json:"path"`
MultipartId string `json:"mid"`
Part *MultipartPartInfo `json:"part"`
}
type RemoveMultipartRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Path string `json:"path"`
MultipartId string `json:"mid"`
}
type ListMultipartRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Marker string `json:"mk"`
MultipartIdMarker string `json:"mmk"`
Max uint64 `json:"max"`
Delimiter string `json:"dm"`
Prefix string `json:"pf"`
}
type ListMultipartResponse struct {
Multiparts []*MultipartInfo `json:"mps"`
}
type UpdateSummaryInfoRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
FileInc int64 `json:"fileinc"`
DirInc int64 `json:"dirinc"`
ByteInc int64 `json:"byteinc"`
}
type SetMasterQuotaReuqest struct {
VolName string `json:"vol"`
PathInfos []QuotaPathInfo `json:"pinfos"`
MaxFiles uint64 `json:"mf"`
MaxBytes uint64 `json:"mbyte"`
}
type UpdateMasterQuotaReuqest struct {
VolName string `json:"vol"`
QuotaId uint32 `json:"qid"`
MaxFiles uint64 `json:"mf"`
MaxBytes uint64 `json:"mbyte"`
}
type ListMasterQuotaResponse struct {
Quotas []*QuotaInfo
}
type BatchSetMetaserverQuotaReuqest struct {
PartitionId uint64 `json:"pid"`
Inodes []uint64 `json:"ino"`
QuotaId uint32 `json:"qid"`
IsRoot bool `json:"root"`
}
type BatchSetMetaserverQuotaResponse struct {
InodeRes map[uint64]uint8 `json:"inores"`
}
type BatchDeleteMetaserverQuotaReuqest struct {
PartitionId uint64 `json:"pid"`
Inodes []uint64 `json:"ino"`
QuotaId uint32 `json:"qid"`
}
type BatchDeleteMetaserverQuotaResponse struct {
InodeRes map[uint64]uint8 `json:"inores"`
}
type GetInodeQuotaRequest struct {
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
}
type GetInodeQuotaResponse struct {
MetaQuotaInfoMap map[uint32]*MetaQuotaInfo
}
type AppendMultipartResponse struct {
Status uint8 `json:"status"`
Update bool `json:"update"`
OldInode uint64 `json:"oldinode"`
}
type GetUniqIDRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Num uint32 `json:"num"`
}
type GetUniqIDResponse struct {
Start uint64 `json:"start"`
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
type LcConfiguration struct {
VolName string
Rules []*Rule
}
type Rule struct {
Expire *ExpirationConfig
Filter *FilterConfig
ID string
Status string
}
type ExpirationConfig struct {
Date *time.Time
Days int
}
type FilterConfig struct {
Prefix string
}
const (
RuleEnabled string = "Enabled"
RuleDisabled string = "Disabled"
)
func (lcConf *LcConfiguration) GenEnabledRuleTasks() []*RuleTask {
tasks := make([]*RuleTask, 0)
for _, r := range lcConf.Rules {
if r.Status != RuleEnabled {
log.LogDebugf("GenEnabledRuleTasks: skip disabled rule(%v) in volume(%v)", r.ID, lcConf.VolName)
continue
}
task := &RuleTask{
Id: fmt.Sprintf("%s:%s", lcConf.VolName, r.ID),
VolName: lcConf.VolName,
Rule: r,
}
tasks = append(tasks, task)
log.LogDebugf("GenEnabledRuleTasks: RuleTask(%v) generated from rule(%v) in volume(%v)", *task, r.ID, lcConf.VolName)
}
return tasks
}
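// Usage sketch (editor's illustration): only rules whose Status is
// RuleEnabled become RuleTasks; task IDs take the form "<volume>:<ruleID>".
//
//	conf := &LcConfiguration{
//		VolName: "vol1",
//		Rules: []*Rule{
//			{ID: "expire-logs", Status: RuleEnabled, Expire: &ExpirationConfig{Days: 30}},
//			{ID: "paused-rule", Status: RuleDisabled},
//		},
//	}
//	tasks := conf.GenEnabledRuleTasks() // one task with Id "vol1:expire-logs"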
// ----------------------------------------------
// lcnode <-> master
// LcNodeRuleTask
type LcNodeRuleTaskRequest struct {
MasterAddr string
LcNodeAddr string
Task *RuleTask
}
type RuleTask struct {
Id string
VolName string
Rule *Rule
}
type LcNodeRuleTaskResponse struct {
ID string
LcNode string
StartTime *time.Time
EndTime *time.Time
UpdateTime *time.Time
Done bool
Status uint8
Result string
LcNodeRuleTaskStatistics
}
type LcNodeRuleTaskStatistics struct {
Volume string
RuleId string
TotalInodeScannedNum int64
FileScannedNum int64
DirScannedNum int64
ExpiredNum int64
ErrorSkippedNum int64
}
// ----------------------------------
// lcnode <-> meta
type ScanDentry struct {
ParentId uint64 `json:"pid"` // FileID value of the parent inode.
Inode uint64 `json:"inode"` // FileID value of the current inode.
Name string `json:"name"` // Name of the current dentry.
Path string `json:"path"` // Path of the current dentry.
Type uint32 `json:"type"` // Type of the current dentry.
}
type BatchDentries struct {
sync.RWMutex
dentries map[uint64]*ScanDentry
}
func NewBatchDentries() *BatchDentries {
return &BatchDentries{
dentries: make(map[uint64]*ScanDentry),
}
}
func (f *BatchDentries) Append(dentry *ScanDentry) {
f.Lock()
defer f.Unlock()
f.dentries[dentry.Inode] = dentry
}
func (f *BatchDentries) Len() int {
f.RLock()
defer f.RUnlock()
return len(f.dentries)
}
func (f *BatchDentries) BatchGetAndClear() (map[uint64]*ScanDentry, []uint64) {
f.Lock()
defer f.Unlock()
dentries := f.dentries
var inodes []uint64
for i := range f.dentries {
inodes = append(inodes, i)
}
f.dentries = make(map[uint64]*ScanDentry)
return dentries, inodes
}
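// Usage sketch (editor's illustration): BatchDentries accumulates scanned
// dentries keyed by inode, then hands them off in one shot, e.g. for a
// batched inode lookup.
//
//	batch := NewBatchDentries()
//	batch.Append(&ScanDentry{Inode: 10, Name: "a.log"})
//	batch.Append(&ScanDentry{Inode: 11, Name: "b.log"})
//	dentries, inodes := batch.BatchGetAndClear() // batch is empty afterwards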
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import "sync"
// CreateNameSpaceRequest defines the request to create a name space.
type CreateNameSpaceRequest struct {
Name string
}
// CreateNameSpaceResponse defines the response to the request of creating a name space.
type CreateNameSpaceResponse struct {
Status int
Result string
}
// Peer defines the peer of the node id and address.
type Peer struct {
ID uint64 `json:"id"`
Addr string `json:"addr"`
}
// CreateMetaPartitionRequest defines the request to create a meta partition.
type CreateMetaPartitionRequest struct {
MetaId string
VolName string
Start uint64
End uint64
PartitionID uint64
Members []Peer
VerSeq uint64
}
// CreateMetaPartitionResponse defines the response to the request of creating a meta partition.
type CreateMetaPartitionResponse struct {
VolName string
PartitionID uint64
Status uint8
Result string
}
type UidSpaceInfo struct {
VolName string
Uid uint32
CTime int64
Enabled bool
Limited bool
UsedSize uint64
LimitSize uint64
Rsv string
}
type UidReportSpaceInfo struct {
Uid uint32
Size uint64
Rsv string
MTime int64
}
type QuotaUsedInfo struct {
UsedFiles int64
UsedBytes int64
}
type QuotaLimitedInfo struct {
LimitedFiles bool
LimitedBytes bool
}
type QuotaReportInfo struct {
QuotaId uint32
UsedInfo QuotaUsedInfo
}
type QuotaInfo struct {
VolName string
QuotaId uint32
CTime int64
PathInfos []QuotaPathInfo
LimitedInfo QuotaLimitedInfo
UsedInfo QuotaUsedInfo
MaxFiles uint64
MaxBytes uint64
Rsv string
}
type QuotaHeartBeatInfo struct {
VolName string
QuotaId uint32
LimitedInfo QuotaLimitedInfo
Enable bool
}
type MetaQuotaInfos struct {
QuotaInfoMap map[uint32]*MetaQuotaInfo
sync.RWMutex
}
type MetaQuotaInfo struct {
RootInode bool `json:"rid"`
}
type QuotaPathInfo struct {
FullPath string
RootInode uint64
PartitionId uint64
}
func (usedInfo *QuotaUsedInfo) Add(info *QuotaUsedInfo) {
usedInfo.UsedFiles += info.UsedFiles
usedInfo.UsedBytes += info.UsedBytes
}
func (quotaInfo *QuotaInfo) IsOverQuotaFiles() (isOver bool) {
return uint64(quotaInfo.UsedInfo.UsedFiles) > quotaInfo.MaxFiles
}
func (quotaInfo *QuotaInfo) IsOverQuotaBytes() (isOver bool) {
return uint64(quotaInfo.UsedInfo.UsedBytes) > quotaInfo.MaxBytes
}
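// Usage sketch (editor's illustration): aggregate per-partition usage into a
// QuotaInfo and check both limits.
//
//	q := &QuotaInfo{MaxFiles: 1000, MaxBytes: 1 << 30}
//	q.UsedInfo.Add(&QuotaUsedInfo{UsedFiles: 400, UsedBytes: 1 << 20})
//	q.UsedInfo.Add(&QuotaUsedInfo{UsedFiles: 700, UsedBytes: 1 << 20})
//	q.IsOverQuotaFiles() // true: 1100 > 1000
//	q.IsOverQuotaBytes() // false: 2 MiB < 1 GiB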
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
DefaultZoneName = "default"
)
// MetaNodeInfo defines the structure of a meta node.
type MetaNodeInfo struct {
ID uint64
Addr string
DomainAddr string
IsActive bool
IsWriteAble bool
ZoneName string `json:"Zone"`
MaxMemAvailWeight uint64 `json:"MaxMemAvailWeight"`
Total uint64 `json:"TotalWeight"`
Used uint64 `json:"UsedWeight"`
Ratio float64
SelectCount uint64
Threshold float32
ReportTime time.Time
MetaPartitionCount int
NodeSetID uint64
PersistenceMetaPartitions []uint64
RdOnly bool
CpuUtil float64 `json:"cpuUtil"`
}
// DataNodeInfo stores all the information about a data node.
type DataNodeInfo struct {
Total uint64 `json:"TotalWeight"`
Used uint64 `json:"UsedWeight"`
AvailableSpace uint64
ID uint64
ZoneName string `json:"Zone"`
Addr string
DomainAddr string
ReportTime time.Time
IsActive bool
IsWriteAble bool
UsageRatio float64 // used / total space
SelectedTimes uint64 // number of times that this datanode has been selected as the location for a data partition.
DataPartitionReports []*DataPartitionReport
DataPartitionCount uint32
NodeSetID uint64
PersistenceDataPartitions []uint64
BadDisks []string
RdOnly bool
MaxDpCntLimit uint32 `json:"maxDpCntLimit"`
CpuUtil float64 `json:"cpuUtil"`
IoUtils map[string]float64 `json:"ioUtil"`
}
// MetaPartitionInfo defines the structure of a meta partition.
type MetaPartitionInfo struct {
PartitionID uint64
Start uint64
End uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
VolName string
Replicas []*MetaReplicaInfo
ReplicaNum uint8
Status int8
IsRecover bool
Hosts []string
Peers []Peer
Zones []string
NodeSets []uint64
OfflinePeerID uint64
MissNodes map[string]int64
LoadResponse []*MetaPartitionLoadResponse
Forbidden bool
}
// MetaReplicaInfo defines the replica of a meta partition.
type MetaReplicaInfo struct {
Addr string
DomainAddr string
MaxInodeID uint64
ReportTime int64
Status int8 // unavailable, readOnly, readWrite
IsLeader bool
InodeCount uint64
MaxInode uint64
DentryCount uint64
}
// ClusterView provides the view of a cluster.
type ClusterView struct {
Name string
CreateTime string
LeaderAddr string
DisableAutoAlloc bool
ForbidMpDecommission bool
MetaNodeThreshold float32
Applied uint64
MaxDataPartitionID uint64
MaxMetaNodeID uint64
MaxMetaPartitionID uint64
DataNodeStatInfo *NodeStatInfo
MetaNodeStatInfo *NodeStatInfo
VolStatInfo []*VolStatInfo
BadPartitionIDs []BadPartitionView
BadMetaPartitionIDs []BadPartitionView
MasterNodes []NodeView
MetaNodes []NodeView
DataNodes []NodeView
}
// ClusterNodeInfo defines the structure of a cluster node.
type ClusterNodeInfo struct {
// BatchCount int
LoadFactor string
// MarkDeleteRate int
// AutoRepairRate int
// DeleteWorkerSleepMs int
}
type ClusterIP struct {
Cluster string
// MetaNodeDeleteBatchCount int
// MetaNodeDeleteWorkerSleepMs int
// DataNodeDeleteLimitRate int
// DataNodeAutoRepairLimitRate int
// Ip string
EbsAddr string
// ServicePath string
}
// NodeView provides the view of the data or meta node.
type NodeView struct {
Addr string
IsActive bool
DomainAddr string
ID uint64
IsWritable bool
}
type DpRepairInfo struct {
PartitionID uint64
DecommissionRepairProgress float64
}
type BadPartitionRepairView struct {
Path string
PartitionInfos []DpRepairInfo
}
type BadPartitionView struct {
Path string
PartitionIDs []uint64
}
type ClusterStatInfo struct {
DataNodeStatInfo *NodeStatInfo
MetaNodeStatInfo *NodeStatInfo
ZoneStatInfo map[string]*ZoneStat
}
type ZoneStat struct {
DataNodeStat *ZoneNodesStat
MetaNodeStat *ZoneNodesStat
}
type ZoneNodesStat struct {
Total float64 `json:"TotalGB"`
Used float64 `json:"UsedGB"`
Avail float64 `json:"AvailGB"`
UsedRatio float64
TotalNodes int
WritableNodes int
}
type NodeSetStat struct {
ID uint64
Capacity int
Zone string
MetaNodeNum int
DataNodeNum int
}
type NodeSetStatInfo struct {
ID uint64
Capacity int
Zone string
MetaNodes []*NodeStatView
DataNodes []*NodeStatView
DataNodeSelector string
MetaNodeSelector string
}
type NodeStatView struct {
Addr string
Status bool
DomainAddr string
ID uint64
IsWritable bool
Total uint64
Used uint64
Avail uint64
}
type NodeStatInfo struct {
TotalGB uint64
UsedGB uint64
IncreasedGB int64
UsedRatio string
AvailGB uint64
}
type VolStatInfo struct {
Name string
TotalSize uint64
UsedSize uint64
UsedRatio string
CacheTotalSize uint64
CacheUsedSize uint64
CacheUsedRatio string
EnableToken bool
InodeCount uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
DpReadOnlyWhenVolFull bool
}
// DataPartitionInfo represents the data partition that stores the file contents.
type DataPartitionInfo struct {
PartitionID uint64
PartitionTTL int64
PartitionType int
LastLoadedTime int64
ReplicaNum uint8
Status int8
Recover bool
Replicas []*DataReplica
Hosts []string // host addresses
Peers []Peer
Zones []string
NodeSets []uint64
MissingNodes map[string]int64 // key: address of the missing node, value: when the node is missing
VolName string
VolID uint64
OfflinePeerID uint64
FileInCoreMap map[string]*FileInCore
IsRecover bool
FilesWithMissingReplica map[string]int64 // key: file name, value: last time when a missing replica is found
SingleDecommissionStatus uint32
SingleDecommissionAddr string
RdOnly bool
IsDiscard bool
Forbidden bool
}
// FileInCore defines a file in a data partition.
type FileInCore struct {
Name string
LastModify int64
MetadataArray []*FileMetadata
}
// FileMetadata defines the file metadata on a dataNode
type FileMetadata struct {
Crc uint32
LocAddr string
Size uint32
ApplyID uint64
}
// DataReplica represents the replica of a data partition
type DataReplica struct {
Addr string
DomainAddr string
ReportTime int64
FileCount uint32
Status int8
HasLoadResponse bool // if there is any response when loading
Total uint64 `json:"TotalSize"`
Used uint64 `json:"UsedSize"`
IsLeader bool
NeedsToCompare bool
DiskPath string
DecommissionRepairProgress float64
}
// DataPartitionDiagnosis represents the inactive data nodes, corrupt data partitions, and data partitions lacking replicas.
type DataPartitionDiagnosis struct {
InactiveDataNodes []string
CorruptDataPartitionIDs []uint64
LackReplicaDataPartitionIDs []uint64
RepFileCountDifferDpIDs []uint64
RepUsedSizeDifferDpIDs []uint64
ExcessReplicaDpIDs []uint64
// BadDataPartitionIDs []BadPartitionView
BadDataPartitionInfos []BadPartitionRepairView
BadReplicaDataPartitionIDs []uint64
}
// MetaPartitionDiagnosis represents the inactive meta nodes, corrupt meta partitions, and meta partitions lacking replicas.
type MetaPartitionDiagnosis struct {
InactiveMetaNodes []string
CorruptMetaPartitionIDs []uint64
LackReplicaMetaPartitionIDs []uint64
BadMetaPartitionIDs []BadPartitionView
BadReplicaMetaPartitionIDs []uint64
ExcessReplicaMetaPartitionIDs []uint64
InodeCountNotEqualReplicaMetaPartitionIDs []uint64
MaxInodeNotEqualReplicaMetaPartitionIDs []uint64
DentryCountNotEqualReplicaMetaPartitionIDs []uint64
}
type DecommissionProgress struct {
Status uint32
Progress string
FailedDps []uint64
StatusMessage string
}
type BadDiskInfo struct {
Address string
Path string
TotalPartitionCnt int
DiskErrPartitionList []uint64
}
type BadDiskInfos struct {
BadDisks []BadDiskInfo
}
type DiscardDataPartitionInfos struct {
DiscardDps []DataPartitionInfo
}
type VolVersionInfo struct {
Ver uint64 // unixMicro of createTime used as version
DelTime int64
Status uint8 // building,normal,deleted,abnormal
}
func (vv *VolVersionInfo) String() string {
return fmt.Sprintf("Ver:%v|DelTimt:%v|status:%v", vv.Ver, vv.DelTime, vv.Status)
}
type VolVersionInfoList struct {
VerList []*VolVersionInfo
Strategy VolumeVerStrategy
TemporaryVerMap map[uint64]*VolVersionInfo
RWLock sync.RWMutex
}
func (v *VolVersionInfoList) GetNextOlderVer(ver uint64) (verSeq uint64, err error) {
v.RWLock.RLock()
defer v.RWLock.RUnlock()
log.LogDebugf("getNextOlderVer ver %v", ver)
for idx, info := range v.VerList {
log.LogDebugf("getNextOlderVer id %v ver %v info %v", idx, info.Ver, info)
if info.Ver >= ver {
if idx == 0 {
return 0, fmt.Errorf("not found")
}
return v.VerList[idx-1].Ver, nil
}
}
log.LogErrorf("getNextOlderVer ver %v not found", ver)
return 0, fmt.Errorf("version not exist")
}
func (v *VolVersionInfoList) GetNextNewerVer(ver uint64) (verSeq uint64, err error) {
log.LogDebugf("getNextOlderVer ver %v", ver)
for idx, info := range v.VerList {
log.LogDebugf("getNextOlderVer id %v ver %v info %v", idx, info.Ver, info)
if info.Ver > ver {
return info.Ver, nil
}
}
log.LogErrorf("getNextOlderVer ver %v not found", ver)
return 0, fmt.Errorf("version not exist")
}
func (v *VolVersionInfoList) GetLastVolVerInfo() *VolVersionInfo {
if len(v.VerList) == 0 {
return nil
}
return v.VerList[len(v.VerList)-1]
}
func (v *VolVersionInfoList) GetLastVer() uint64 {
if len(v.VerList) == 0 {
return 0
}
return v.VerList[len(v.VerList)-1].Ver
}
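// Usage sketch (editor's illustration): navigate the version list, which is
// assumed to be sorted ascending by Ver.
//
//	vl := &VolVersionInfoList{VerList: []*VolVersionInfo{{Ver: 10}, {Ver: 20}, {Ver: 30}}}
//	older, _ := vl.GetNextOlderVer(20) // 10
//	newer, _ := vl.GetNextNewerVer(20) // 30
//	last := vl.GetLastVer()            // 30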
type DecommissionDiskLimitDetail struct {
NodeSetId uint64
Limit int
}
type DecommissionDiskLimit struct {
Details []DecommissionDiskLimitDetail
}
type DecommissionDiskInfo struct {
SrcAddr string
DiskPath string
DecommissionStatus uint32
DecommissionRaftForce bool
DecommissionRetry uint8
DecommissionDpTotal int
DecommissionTerm uint64
DecommissionLimit int
Type uint32
DecommissionCompleteTime int64
Progress float64
}
type DecommissionDisksResponse struct {
Infos []DecommissionDiskInfo
}
package proto
import (
"flag"
"fmt"
"strconv"
"github.com/cubefs/cubefs/util/auth"
"github.com/cubefs/cubefs/util/config"
)
// For client
const (
// Mandatory
MountPoint int = iota
VolName
Owner
Master
// Optional
LogDir
WarnLogDir
LogLevel
ProfPort
IcacheTimeout
LookupValid
AttrValid
ReadRate
WriteRate
EnSyncWrite
AutoInvalData
Rdonly
WriteCache
KeepCache
FollowerRead
Authenticate
ClientKey
TicketHost
EnableHTTPS
CertFile
AccessKey
SecretKey
DisableDcache
SubDir
FsyncOnClose
MaxCPUs
EnableXattr
NearRead
EnablePosixACL
EnableSummary
EnableUnixPermission
RequestTimeout
// adls
VolType
EbsEndpoint
EbsServerPath
CacheAction
EbsBlockSize
EnableBcache
BcacheDir
BcacheFilterFiles
BcacheBatchCnt
BcacheCheckIntervalS
ReadThreads
WriteThreads
MetaSendTimeout
BuffersTotalLimit
MaxStreamerLimit
EnableAudit
LocallyProf
MinWriteAbleDataPartitionCnt
FileSystemName
// snapshot
SnapshotReadVerSeq
DisableMountSubtype
MaxMountOption
)
// For server
const (
MasterAddr = "masterAddr"
ListenPort = "listen"
ObjectNodeDomain = "objectNodeDomain"
BindIpKey = "bindIp"
)
type MountOption struct {
keyword string
description string
cmdlineValue string
value interface{}
}
func (opt MountOption) String() string {
return fmt.Sprintf("[%v] %T: %v", opt.keyword, opt.value, opt.value)
}
func NewMountOptions() []MountOption {
opts := make([]MountOption, MaxMountOption)
return opts
}
func InitMountOptions(opts []MountOption) {
opts[MountPoint] = MountOption{"mountPoint", "Mount Point", "", ""}
opts[VolName] = MountOption{"volName", "Volume Name", "", ""}
opts[Owner] = MountOption{"owner", "Owner", "", ""}
opts[Master] = MountOption{MasterAddr, "Master Address", "", ""}
opts[LogDir] = MountOption{"logDir", "Log Path", "", ""}
opts[WarnLogDir] = MountOption{"warnLogDir", "Warn Log Path", "", ""}
opts[LogLevel] = MountOption{"logLevel", "Log Level", "", ""}
opts[ProfPort] = MountOption{"profPort", "PProf Port", "", ""}
opts[LocallyProf] = MountOption{"locallyProf", "Locally PProf", "", false}
opts[IcacheTimeout] = MountOption{"icacheTimeout", "Inode Cache Expiration Time", "", int64(-1)}
opts[LookupValid] = MountOption{"lookupValid", "Lookup Valid Duration", "", int64(-1)}
opts[AttrValid] = MountOption{"attrValid", "Attr Valid Duration", "", int64(-1)}
opts[ReadRate] = MountOption{"readRate", "Read Rate Limit", "", int64(-1)}
opts[WriteRate] = MountOption{"writeRate", "Write Rate Limit", "", int64(-1)}
opts[EnSyncWrite] = MountOption{"enSyncWrite", "Enable Sync Write", "", int64(-1)}
opts[AutoInvalData] = MountOption{"autoInvalData", "Auto Invalidate Data", "", int64(-1)}
opts[Rdonly] = MountOption{"rdonly", "Mount as readonly", "", false}
opts[WriteCache] = MountOption{"writecache", "Enable FUSE writecache feature", "", false}
opts[KeepCache] = MountOption{"keepcache", "Enable FUSE keepcache feature", "", false}
opts[FollowerRead] = MountOption{"followerRead", "Enable read from follower", "", false}
opts[NearRead] = MountOption{"nearRead", "Enable read from nearest node", "", true}
opts[Authenticate] = MountOption{"authenticate", "Enable Authenticate", "", false}
opts[ClientKey] = MountOption{"clientKey", "Client Key", "", ""}
opts[TicketHost] = MountOption{"ticketHost", "Ticket Host", "", ""}
opts[EnableHTTPS] = MountOption{"enableHTTPS", "Enable HTTPS", "", false}
opts[CertFile] = MountOption{"certFile", "Cert File", "", ""}
opts[AccessKey] = MountOption{"accessKey", "Access Key", "", ""}
opts[SecretKey] = MountOption{"secretKey", "Secret Key", "", ""}
opts[DisableDcache] = MountOption{"disableDcache", "Disable Dentry Cache", "", false}
opts[SubDir] = MountOption{"subdir", "Mount sub directory", "", ""}
opts[FsyncOnClose] = MountOption{"fsyncOnClose", "Perform fsync upon file close", "", true}
opts[MaxCPUs] = MountOption{"maxcpus", "The maximum number of CPUs that can be executing simultaneously", "", int64(-1)}
opts[EnableXattr] = MountOption{"enableXattr", "Enable xattr support", "", false}
opts[EnablePosixACL] = MountOption{"enablePosixACL", "Enable posix ACL support", "", false}
opts[EnableSummary] = MountOption{"enableSummary", "Enable content summary", "", false}
opts[EnableUnixPermission] = MountOption{"enableUnixPermission", "Enable unix permission check(e.g: 777/755)", "", false}
opts[VolType] = MountOption{"volType", "volume type", "", int64(0)}
opts[EbsEndpoint] = MountOption{"ebsEndpoint", "Ebs service address", "", ""}
opts[EbsServerPath] = MountOption{"ebsServerPath", "Ebs service path", "", ""}
opts[CacheAction] = MountOption{"cacheAction", "Cold cache action", "", int64(0)}
opts[EbsBlockSize] = MountOption{"ebsBlockSize", "Ebs object size", "", ""}
// opts[EnableBcache] = MountOption{"enableBcache", "Enable block cache", "", false}
opts[BcacheDir] = MountOption{"bcacheDir", "block cache dir", "", ""}
opts[ReadThreads] = MountOption{"readThreads", "Cold volume read threads", "", int64(10)}
opts[WriteThreads] = MountOption{"writeThreads", "Cold volume write threads", "", int64(10)}
opts[MetaSendTimeout] = MountOption{"metaSendTimeout", "Meta send timeout", "", int64(600)}
opts[BuffersTotalLimit] = MountOption{"buffersTotalLimit", "Send/Receive packets memory limit", "", int64(32768)} // default 4G
opts[MaxStreamerLimit] = MountOption{"maxStreamerLimit", "The maximum number of streamers", "", int64(0)} // default 0
opts[BcacheFilterFiles] = MountOption{"bcacheFilterFiles", "The block cache filter files suffix", "", "py;pyx;sh;yaml;conf;pt;pth;log;out"}
opts[BcacheBatchCnt] = MountOption{"bcacheBatchCnt", "The block cache get meta count", "", int64(100000)}
opts[BcacheCheckIntervalS] = MountOption{"bcacheCheckIntervalS", "The block cache check interval", "", int64(300)}
opts[EnableAudit] = MountOption{"enableAudit", "enable client audit logging", "", false}
opts[RequestTimeout] = MountOption{"requestTimeout", "The Request Expiration Time", "", int64(0)}
opts[MinWriteAbleDataPartitionCnt] = MountOption{
"minWriteAbleDataPartitionCnt",
"Min writeable data partition count retained int dpSelector when update DataPartitionsView from master",
"", int64(10),
}
opts[FileSystemName] = MountOption{"fileSystemName", "The explicit name of the filesystem", "", ""}
opts[SnapshotReadVerSeq] = MountOption{"snapshotReadSeq", "Snapshot read seq", "", int64(0)} // default false
opts[DisableMountSubtype] = MountOption{"disableMountSubtype", "Disable Mount Subtype", "", false}
for i := 0; i < MaxMountOption; i++ {
flag.StringVar(&opts[i].cmdlineValue, opts[i].keyword, "", opts[i].description)
}
}
func ParseMountOptions(opts []MountOption, cfg *config.Config) {
for i := 0; i < MaxMountOption; i++ {
switch v := opts[i].value.(type) {
case string:
if opts[i].cmdlineValue != "" {
opts[i].value = opts[i].cmdlineValue
} else {
if value, present := cfg.CheckAndGetString(opts[i].keyword); present {
opts[i].value = value
} else {
opts[i].value = v
}
}
fmt.Printf("keyword[%v] value[%v] type[%T]\n", opts[i].keyword, opts[i].value, v)
case int64:
if opts[i].cmdlineValue != "" {
opts[i].value = parseInt64(opts[i].cmdlineValue)
} else {
if present := cfg.HasKey(opts[i].keyword); present {
opts[i].value = cfg.GetInt64(opts[i].keyword)
} else {
opts[i].value = v
}
}
fmt.Printf("keyword[%v] value[%v] type[%T]\n", opts[i].keyword, opts[i].value, v)
case bool:
if opts[i].cmdlineValue != "" {
opts[i].value = parseBool(opts[i].cmdlineValue)
} else {
if value, present := cfg.CheckAndGetBool(opts[i].keyword); present {
opts[i].value = value
} else {
opts[i].value = v
}
}
fmt.Printf("keyword[%v] value[%v] type[%T]\n", opts[i].keyword, opts[i].value, v)
default:
fmt.Printf("keyword[%v] unknown type[%T]\n", opts[i].keyword, v)
}
}
}
func parseInt64(s string) int64 {
var ret int64 = -1
if s != "" {
val, err := strconv.Atoi(s)
if err == nil {
ret = int64(val)
}
}
return ret
}
func parseBool(s string) bool {
return s == "true"
}
func (opt *MountOption) GetString() string {
val, ok := opt.value.(string)
if !ok {
return ""
}
return val
}
func (opt *MountOption) GetBool() bool {
val, ok := opt.value.(bool)
if !ok {
return false
}
return val
}
func (opt *MountOption) GetInt64() int64 {
val, ok := opt.value.(int64)
if !ok {
return int64(-1)
}
return val
}
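// Usage sketch (editor's illustration): command-line flags take precedence
// over the JSON config, and anything not set falls back to the defaults
// wired in InitMountOptions. The cfg variable is a hypothetical
// *config.Config loaded by the caller.
//
//	opts := NewMountOptions()
//	InitMountOptions(opts)
//	flag.Parse()
//	ParseMountOptions(opts, cfg)
//	volName := opts[VolName].GetString()
//	readRate := opts[ReadRate].GetInt64()
//	rdonly := opts[Rdonly].GetBool()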
type MountOptions struct {
Config *config.Config
MountPoint string
Volname string
Owner string
Master string
Logpath string
Loglvl string
Profport string
LocallyProf bool
IcacheTimeout int64
LookupValid int64
AttrValid int64
ReadRate int64
WriteRate int64
EnSyncWrite int64
AutoInvalData int64
UmpDatadir string
Rdonly bool
WriteCache bool
KeepCache bool
FollowerRead bool
Authenticate bool
TicketMess auth.TicketMess
TokenKey string
AccessKey string
SecretKey string
DisableDcache bool
SubDir string
FsyncOnClose bool
MaxCPUs int64
EnableXattr bool
NearRead bool
EnablePosixACL bool
EnableQuota bool
EnableTransaction string
TxTimeout int64
TxConflictRetryNum int64
TxConflictRetryInterval int64
VolType int
EbsEndpoint string
EbsServicePath string
CacheAction int
CacheThreshold int
EbsBlockSize int
EnableBcache bool
BcacheDir string
BcacheFilterFiles string
BcacheCheckIntervalS int64
BcacheBatchCnt int64
ReadThreads int64
WriteThreads int64
EnableSummary bool
EnableUnixPermission bool
NeedRestoreFuse bool
MetaSendTimeout int64
BuffersTotalLimit int64
MaxStreamerLimit int64
EnableAudit bool
RequestTimeout int64
MinWriteAbleDataPartitionCnt int
FileSystemName string
VerReadSeq uint64
// disable mount subtype
DisableMountSubtype bool
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"bytes"
"encoding/binary"
"fmt"
"github.com/cubefs/cubefs/util/btree"
)
type Blob struct {
MinBid uint64
Count uint64
Vid uint64
}
// ObjExtentKey defines the extent key struct.
type ObjExtentKey struct {
Cid uint64 // cluster id
CodeMode uint8 // EC encode and decode mode
BlobSize uint32 // block size
BlobsLen uint32 // blob array length
Size uint64 // objExtentKey size
Blobs []Blob
FileOffset uint64 // obj offset in file
Crc uint32
// snapshot
VerSeq uint64
ModGen uint64
}
// String returns the string format of the extentKey.
func (k ObjExtentKey) String() string {
return fmt.Sprintf("ObjExtentKey{FileOffset(%v),Cid(%v),CodeMode(%v),BlobSize(%v),BlobsLen(%v),Blobs(%v),Size(%v),Crc(%v)}", k.FileOffset, k.Cid, k.CodeMode, k.BlobSize, k.BlobsLen, k.Blobs, k.Size, k.Crc)
}
// Less defines the less comparator.
func (k *ObjExtentKey) Less(than btree.Item) bool {
that := than.(*ObjExtentKey)
return k.FileOffset < that.FileOffset
}
// Copy returns the item itself; no deep copy is made.
func (k *ObjExtentKey) Copy() btree.Item {
return k
}
func (k *ObjExtentKey) IsEquals(obj *ObjExtentKey) bool {
if k.FileOffset != obj.FileOffset {
return false
}
if k.Cid != obj.Cid {
return false
}
if k.CodeMode != obj.CodeMode {
return false
}
if k.BlobSize != obj.BlobSize {
return false
}
if k.BlobsLen != obj.BlobsLen {
return false
}
if k.Size != obj.Size {
return false
}
if k.Crc != obj.Crc {
return false
}
if len(k.Blobs) != len(obj.Blobs) {
return false
}
if len(k.Blobs) > 0 {
for i := len(k.Blobs) - 1; i >= 0; i-- {
if k.Blobs[i].Count != obj.Blobs[i].Count || k.Blobs[i].MinBid != obj.Blobs[i].MinBid || k.Blobs[i].Vid != obj.Blobs[i].Vid {
return false
}
}
}
return true
}
// MarshalBinary marshals the binary format of the extent key.
func (k *ObjExtentKey) MarshalBinary() ([]byte, error) {
buf := bytes.NewBuffer(make([]byte, 0))
if err := binary.Write(buf, binary.BigEndian, uint32(len(k.Blobs))); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.FileOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Size); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Crc); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.CodeMode); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Cid); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.BlobSize); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Blobs); err != nil {
return nil, err
}
return buf.Bytes(), nil
}
func (k *ObjExtentKey) UnmarshalBinary(buf *bytes.Buffer) (err error) {
if err = binary.Read(buf, binary.BigEndian, &k.BlobsLen); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.FileOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Size); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Crc); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.CodeMode); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Cid); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.BlobSize); err != nil {
return
}
blobs := make([]Blob, 0, int(k.BlobsLen))
for i := 0; i < int(k.BlobsLen); i++ {
tmpBlob := Blob{}
if err = binary.Read(buf, binary.BigEndian, &tmpBlob); err != nil {
return
}
blobs = append(blobs, tmpBlob)
}
k.Blobs = blobs
return
}
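// Usage sketch (editor's illustration): MarshalBinary and UnmarshalBinary
// are a round-trip pair; the blob count written first becomes BlobsLen on
// decode.
//
//	src := &ObjExtentKey{Cid: 1, CodeMode: 2, BlobSize: 1 << 22, Size: 4096,
//		Blobs: []Blob{{MinBid: 100, Count: 1, Vid: 7}}}
//	data, _ := src.MarshalBinary()
//	dst := &ObjExtentKey{}
//	_ = dst.UnmarshalBinary(bytes.NewBuffer(data))
//	// dst.BlobsLen == 1 and dst.Blobs mirrors src.Blobs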
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"bytes"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"strconv"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/buf"
"github.com/cubefs/cubefs/util/log"
)
var (
GRequestID = int64(1)
Buffers *buf.BufferPool
)
// GenerateRequestID generates the request ID.
func GenerateRequestID() int64 {
return atomic.AddInt64(&GRequestID, 1)
}
const (
AddrSplit = "/"
)
// Operations
const (
ProtoMagic uint8 = 0xFF
OpInitResultCode uint8 = 0x00
OpCreateExtent uint8 = 0x01
OpMarkDelete uint8 = 0x02
OpWrite uint8 = 0x03
OpRead uint8 = 0x04
OpStreamRead uint8 = 0x05
OpStreamFollowerRead uint8 = 0x06
OpGetAllWatermarks uint8 = 0x07
OpNotifyReplicasToRepair uint8 = 0x08
OpExtentRepairRead uint8 = 0x09
OpBroadcastMinAppliedID uint8 = 0x0A
OpRandomWrite uint8 = 0x0F
OpGetAppliedId uint8 = 0x10
OpGetPartitionSize uint8 = 0x11
OpSyncRandomWrite uint8 = 0x12
OpSyncWrite uint8 = 0x13
OpReadTinyDeleteRecord uint8 = 0x14
OpTinyExtentRepairRead uint8 = 0x15
OpGetMaxExtentIDAndPartitionSize uint8 = 0x16
// Operations: Client -> MetaNode.
OpMetaCreateInode uint8 = 0x20
OpMetaUnlinkInode uint8 = 0x21
OpMetaCreateDentry uint8 = 0x22
OpMetaDeleteDentry uint8 = 0x23
OpMetaOpen uint8 = 0x24
OpMetaLookup uint8 = 0x25
OpMetaReadDir uint8 = 0x26
OpMetaInodeGet uint8 = 0x27
OpMetaBatchInodeGet uint8 = 0x28
OpMetaExtentsAdd uint8 = 0x29
OpMetaExtentsDel uint8 = 0x2A
OpMetaExtentsList uint8 = 0x2B
OpMetaUpdateDentry uint8 = 0x2C
OpMetaTruncate uint8 = 0x2D
OpMetaLinkInode uint8 = 0x2E
OpMetaEvictInode uint8 = 0x2F
OpMetaSetattr uint8 = 0x30
OpMetaReleaseOpen uint8 = 0x31
// Operations: MetaNode Leader -> MetaNode Follower
OpMetaFreeInodesOnRaftFollower uint8 = 0x32
OpMetaDeleteInode uint8 = 0x33 // delete specified inode immediately and do not remove data.
OpMetaBatchExtentsAdd uint8 = 0x34 // for extents batch attachment
OpMetaSetXAttr uint8 = 0x35
OpMetaGetXAttr uint8 = 0x36
OpMetaRemoveXAttr uint8 = 0x37
OpMetaListXAttr uint8 = 0x38
OpMetaBatchGetXAttr uint8 = 0x39
OpMetaExtentAddWithCheck uint8 = 0x3A // Append extent key with discard extents check
OpMetaReadDirLimit uint8 = 0x3D
// Operations: Master -> MetaNode
OpCreateMetaPartition uint8 = 0x40
OpMetaNodeHeartbeat uint8 = 0x41
OpDeleteMetaPartition uint8 = 0x42
OpUpdateMetaPartition uint8 = 0x43
OpLoadMetaPartition uint8 = 0x44
OpDecommissionMetaPartition uint8 = 0x45
OpAddMetaPartitionRaftMember uint8 = 0x46
OpRemoveMetaPartitionRaftMember uint8 = 0x47
OpMetaPartitionTryToLeader uint8 = 0x48
// Quota
OpMetaBatchSetInodeQuota uint8 = 0x50
OpMetaBatchDeleteInodeQuota uint8 = 0x51
OpMetaGetInodeQuota uint8 = 0x52
OpQuotaCreateInode uint8 = 0x53
OpQuotaCreateDentry uint8 = 0x54
// Operations: Master -> LcNode
OpLcNodeHeartbeat uint8 = 0x55
OpLcNodeScan uint8 = 0x56
OpLcNodeSnapshotVerDel uint8 = 0x57
// Operations: Master -> DataNode
OpCreateDataPartition uint8 = 0x60
OpDeleteDataPartition uint8 = 0x61
OpLoadDataPartition uint8 = 0x62
OpDataNodeHeartbeat uint8 = 0x63
OpReplicateFile uint8 = 0x64
OpDeleteFile uint8 = 0x65
OpDecommissionDataPartition uint8 = 0x66
OpAddDataPartitionRaftMember uint8 = 0x67
OpRemoveDataPartitionRaftMember uint8 = 0x68
OpDataPartitionTryToLeader uint8 = 0x69
OpQos uint8 = 0x6A
OpStopDataPartitionRepair uint8 = 0x6B
// Operations: MultipartInfo
OpCreateMultipart uint8 = 0x70
OpGetMultipart uint8 = 0x71
OpAddMultipartPart uint8 = 0x72
OpRemoveMultipart uint8 = 0x73
OpListMultiparts uint8 = 0x74
OpBatchDeleteExtent uint8 = 0x75 // SDK to MetaNode
OpGetExpiredMultipart uint8 = 0x76
// Operations: MetaNode Leader -> MetaNode Follower
OpMetaBatchDeleteInode uint8 = 0x90
OpMetaBatchDeleteDentry uint8 = 0x91
OpMetaBatchUnlinkInode uint8 = 0x92
OpMetaBatchEvictInode uint8 = 0x93
// Transaction Operations: Client -> MetaNode.
OpMetaTxCreate uint8 = 0xA0
OpMetaTxCreateInode uint8 = 0xA1
OpMetaTxUnlinkInode uint8 = 0xA2
OpMetaTxCreateDentry uint8 = 0xA3
OpTxCommit uint8 = 0xA4
OpTxRollback uint8 = 0xA5
OpTxCommitRM uint8 = 0xA6
OpTxRollbackRM uint8 = 0xA7
OpMetaTxDeleteDentry uint8 = 0xA8
OpMetaTxUpdateDentry uint8 = 0xA9
OpMetaTxLinkInode uint8 = 0xAA
OpMetaTxGet uint8 = 0xAB
// Operations: Client -> MetaNode.
OpMetaGetUniqID uint8 = 0xAC
// Multi version snapshot
OpRandomWriteAppend uint8 = 0xB1
OpSyncRandomWriteAppend uint8 = 0xB2
OpRandomWriteVer uint8 = 0xB3
OpSyncRandomWriteVer uint8 = 0xB4
OpSyncRandomWriteVerRsp uint8 = 0xB5
OpTryWriteAppend uint8 = 0xB6
OpSyncTryWriteAppend uint8 = 0xB7
OpVersionOp uint8 = 0xB8
// Commons
OpNoSpaceErr uint8 = 0xEE
OpDirQuota uint8 = 0xF1
// Commons
OpConflictExtentsErr uint8 = 0xF2
OpIntraGroupNetErr uint8 = 0xF3
OpArgMismatchErr uint8 = 0xF4
OpNotExistErr uint8 = 0xF5
OpDiskNoSpaceErr uint8 = 0xF6
OpDiskErr uint8 = 0xF7
OpErr uint8 = 0xF8
OpAgain uint8 = 0xF9
OpExistErr uint8 = 0xFA
OpInodeFullErr uint8 = 0xFB
OpTryOtherAddr uint8 = 0xFC
OpNotPerm uint8 = 0xFD
OpNotEmpty uint8 = 0xFE
OpOk uint8 = 0xF0
OpAgainVerionList uint8 = 0xEF
OpPing uint8 = 0xFF
OpMetaUpdateXAttr uint8 = 0x3B
OpMetaReadDirOnly uint8 = 0x3C
OpUploadPartConflictErr uint8 = 0x3D
// ebs obj meta
OpMetaObjExtentAdd uint8 = 0xDD
OpMetaObjExtentsList uint8 = 0xDE
OpMetaExtentsEmpty uint8 = 0xDF
OpMetaBatchObjExtentsAdd uint8 = 0xD0
OpMetaClearInodeCache uint8 = 0xD1
OpMetaBatchSetXAttr uint8 = 0xD2
OpMetaGetAllXAttr uint8 = 0xD3
// transaction error
OpTxInodeInfoNotExistErr uint8 = 0xE0
OpTxConflictErr uint8 = 0xE1
OpTxDentryInfoNotExistErr uint8 = 0xE2
OpTxRbInodeNotExistErr uint8 = 0xE3
OpTxRbDentryNotExistErr uint8 = 0xE4
OpTxInfoNotExistErr uint8 = 0xE5
OpTxInternalErr uint8 = 0xE6
OpTxCommitItemErr uint8 = 0xE7
OpTxRollbackItemErr uint8 = 0xE8
OpTxRollbackUnknownRbType uint8 = 0xE9
OpTxTimeoutErr uint8 = 0xEA
OpTxSetStateErr uint8 = 0xEB
OpTxCommitErr uint8 = 0xEC
OpTxRollbackErr uint8 = 0xED
OpTxUnknownOp uint8 = 0xEE
// multiVersion to dp/mp
OpVersionOperation uint8 = 0xD5
OpSplitMarkDelete uint8 = 0xD6
OpTryOtherExtent uint8 = 0xD7
)
const (
WriteDeadlineTime = 5
ReadDeadlineTime = 5
SyncSendTaskDeadlineTime = 30
NoReadDeadlineTime = -1
BatchDeleteExtentReadDeadLineTime = 120
GetAllWatermarksDeadLineTime = 60
DefaultClusterLoadFactor float64 = 10
MultiVersionFlag = 0x80
VersionListFlag = 0x40
)
// multi version operation
const (
CreateVersion = 1
DeleteVersion = 2
CreateVersionPrepare = 3
CreateVersionCommit = 4
SyncBatchVersionList = 5
)
// stage of version building
const (
VersionInit = 0
VersionWorking = 1
VersionWorkingTimeOut = 2
VersionWorkingAbnormal = 3
VersionWorkingFinished = 4
)
// status of version
const (
VersionNormal = 1
VersionDeleted = 2
VersionDeleting = 3
VersionDeleteAbnormal = 4
VersionPrepare = 5
)
const (
TinyExtentType = 0
NormalExtentType = 1
)
const (
NormalCreateDataPartition = 0
DecommissionedCreateDataPartition = 1
)
// Packet defines the packet structure.
type Packet struct {
Magic uint8
ExtentType uint8 // the highest bit is set in the response to the client when the version is inconsistent; VerSeq is valid in that case
Opcode uint8
ResultCode uint8
RemainingFollowers uint8
CRC uint32
Size uint32
ArgLen uint32
KernelOffset uint64
PartitionID uint64
ExtentID uint64
ExtentOffset int64
ReqID int64
Arg []byte // for create or append ops, the data contains the address
Data []byte
StartT int64
mesg string
HasPrepare bool
VerSeq uint64 // only used in mod request to datanode
VerList []*VolVersionInfo
}
func IsTinyExtentType(extentType uint8) bool {
return extentType&NormalExtentType != NormalExtentType
}
func IsNormalExtentType(extentType uint8) bool {
return extentType&NormalExtentType == NormalExtentType
}
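// Usage sketch (editor's illustration): the extent type is a bit field; the
// NormalExtentType bit decides normal vs. tiny, and higher bits such as
// MultiVersionFlag can be set on top without changing that answer.
//
//	IsNormalExtentType(NormalExtentType)                    // true
//	IsTinyExtentType(TinyExtentType)                        // true
//	IsNormalExtentType(NormalExtentType | MultiVersionFlag) // true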
// NewPacket returns a new packet.
func NewPacket() *Packet {
p := new(Packet)
p.Magic = ProtoMagic
p.StartT = time.Now().UnixNano()
return p
}
// NewPacketReqID returns a new packet with ReqID assigned.
func NewPacketReqID() *Packet {
p := NewPacket()
p.ReqID = GenerateRequestID()
return p
}
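// Usage sketch (editor's illustration): build a request packet with a fresh
// request ID, then fill in the operation-specific fields before sending.
//
//	p := NewPacketReqID()
//	p.Opcode = OpMetaInodeGet
//	p.PartitionID = 1
//	// marshal the request body into p.Data and set p.Size accordingly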
func (p *Packet) GetCopy() *Packet {
newPacket := NewPacket()
newPacket.ReqID = p.ReqID
newPacket.Opcode = p.Opcode
newPacket.PartitionID = p.PartitionID
newPacket.Data = make([]byte, p.Size)
copy(newPacket.Data[:p.Size], p.Data)
newPacket.Size = p.Size
return newPacket
}
func (p *Packet) String() string {
return fmt.Sprintf("ReqID(%v)Op(%v)PartitionID(%v)ResultCode(%v)ExID(%v)ExtOffset(%v)KernelOff(%v)Type(%v)Seq(%v)Size(%v)",
p.ReqID, p.GetOpMsg(), p.PartitionID, p.GetResultMsg(), p.ExtentID, p.ExtentOffset, p.KernelOffset, p.ExtentType, p.VerSeq, p.Size)
}
// GetStoreType returns the store type.
func (p *Packet) GetStoreType() (m string) {
if IsNormalExtentType(p.ExtentType) {
return "NormalExtent"
} else if IsTinyExtentType(p.ExtentType) {
return "TinyExtent"
} else {
return "Unknown"
}
}
func (p *Packet) GetOpMsgWithReqAndResult() (m string) {
return fmt.Sprintf("Req(%v)_(%v)_Result(%v)", p.ReqID, p.GetOpMsg(), p.GetResultMsg())
}
// GetOpMsg returns the operation type.
func (p *Packet) GetOpMsg() (m string) {
switch p.Opcode {
case OpCreateExtent:
m = "OpCreateExtent"
case OpMarkDelete:
m = "OpMarkDelete"
case OpSplitMarkDelete:
m = "OpMarkDelete"
case OpWrite:
m = "OpWrite"
case OpTryWriteAppend:
m = "OpTryWriteAppend"
case OpRandomWrite:
m = "OpRandomWrite"
case OpRandomWriteAppend:
m = "OpRandomWriteAppend"
case OpRandomWriteVer:
m = "OpRandomWriteVer"
case OpRead:
m = "Read"
case OpStreamRead:
m = "OpStreamRead"
case OpStreamFollowerRead:
m = "OpStreamFollowerRead"
case OpGetAllWatermarks:
m = "OpGetAllWatermarks"
case OpNotifyReplicasToRepair:
m = "OpNotifyReplicasToRepair"
case OpExtentRepairRead:
m = "OpExtentRepairRead"
case OpConflictExtentsErr:
m = "ConflictExtentsErr"
case OpIntraGroupNetErr:
m = "IntraGroupNetErr"
case OpMetaCreateInode:
m = "OpMetaCreateInode"
case OpQuotaCreateInode:
m = "OpQuotaCreateInode"
case OpMetaUnlinkInode:
m = "OpMetaUnlinkInode"
case OpMetaBatchUnlinkInode:
m = "OpMetaBatchUnlinkInode"
case OpMetaCreateDentry:
m = "OpMetaCreateDentry"
case OpQuotaCreateDentry:
m = "OpQuotaCreateDentry"
case OpMetaDeleteDentry:
m = "OpMetaDeleteDentry"
case OpMetaBatchDeleteDentry:
m = "OpMetaBatchDeleteDentry"
case OpMetaOpen:
m = "OpMetaOpen"
case OpMetaReleaseOpen:
m = "OpMetaReleaseOpen"
case OpMetaLookup:
m = "OpMetaLookup"
case OpMetaReadDir:
m = "OpMetaReadDir"
case OpMetaReadDirLimit:
m = "OpMetaReadDirLimit"
case OpMetaInodeGet:
m = "OpMetaInodeGet"
case OpMetaBatchInodeGet:
m = "OpMetaBatchInodeGet"
case OpMetaExtentsAdd:
m = "OpMetaExtentsAdd"
case OpMetaExtentAddWithCheck:
m = "OpMetaExtentAddWithCheck"
case OpMetaObjExtentAdd:
m = "OpMetaObjExtentAdd"
case OpMetaExtentsDel:
m = "OpMetaExtentsDel"
case OpMetaExtentsList:
m = "OpMetaExtentsList"
case OpMetaObjExtentsList:
m = "OpMetaObjExtentsList"
case OpMetaUpdateDentry:
m = "OpMetaUpdateDentry"
case OpMetaTruncate:
m = "OpMetaTruncate"
case OpMetaLinkInode:
m = "OpMetaLinkInode"
case OpMetaEvictInode:
m = "OpMetaEvictInode"
case OpMetaBatchEvictInode:
m = "OpMetaBatchEvictInode"
case OpMetaSetattr:
m = "OpMetaSetattr"
case OpCreateMetaPartition:
m = "OpCreateMetaPartition"
case OpMetaNodeHeartbeat:
m = "OpMetaNodeHeartbeat"
case OpDeleteMetaPartition:
m = "OpDeleteMetaPartition"
case OpUpdateMetaPartition:
m = "OpUpdateMetaPartition"
case OpLoadMetaPartition:
m = "OpLoadMetaPartition"
case OpDecommissionMetaPartition:
m = "OpDecommissionMetaPartition"
case OpCreateDataPartition:
m = "OpCreateDataPartition"
case OpDeleteDataPartition:
m = "OpDeleteDataPartition"
case OpLoadDataPartition:
m = "OpLoadDataPartition"
case OpDecommissionDataPartition:
m = "OpDecommissionDataPartition"
case OpDataNodeHeartbeat:
m = "OpDataNodeHeartbeat"
case OpReplicateFile:
m = "OpReplicateFile"
case OpDeleteFile:
m = "OpDeleteFile"
case OpGetAppliedId:
m = "OpGetAppliedId"
case OpGetPartitionSize:
m = "OpGetPartitionSize"
case OpSyncWrite:
m = "OpSyncWrite"
case OpSyncTryWriteAppend:
m = "OpSyncTryWriteAppend"
case OpSyncRandomWrite:
m = "OpSyncRandomWrite"
case OpSyncRandomWriteVer:
m = "OpSyncRandomWriteVer"
case OpSyncRandomWriteAppend:
m = "OpSyncRandomWriteAppend"
case OpReadTinyDeleteRecord:
m = "OpReadTinyDeleteRecord"
case OpPing:
m = "OpPing"
case OpTinyExtentRepairRead:
m = "OpTinyExtentRepairRead"
case OpGetMaxExtentIDAndPartitionSize:
m = "OpGetMaxExtentIDAndPartitionSize"
case OpBroadcastMinAppliedID:
m = "OpBroadcastMinAppliedID"
case OpRemoveDataPartitionRaftMember:
m = "OpRemoveDataPartitionRaftMember"
case OpAddDataPartitionRaftMember:
m = "OpAddDataPartitionRaftMember"
case OpAddMetaPartitionRaftMember:
m = "OpAddMetaPartitionRaftMember"
case OpRemoveMetaPartitionRaftMember:
m = "OpRemoveMetaPartitionRaftMember"
case OpMetaPartitionTryToLeader:
m = "OpMetaPartitionTryToLeader"
case OpDataPartitionTryToLeader:
m = "OpDataPartitionTryToLeader"
case OpMetaDeleteInode:
m = "OpMetaDeleteInode"
case OpMetaBatchDeleteInode:
m = "OpMetaBatchDeleteInode"
case OpMetaBatchExtentsAdd:
m = "OpMetaBatchExtentsAdd"
case OpMetaBatchObjExtentsAdd:
m = "OpMetaBatchObjExtentsAdd"
case OpMetaSetXAttr:
m = "OpMetaSetXAttr"
case OpMetaGetXAttr:
m = "OpMetaGetXAttr"
case OpMetaRemoveXAttr:
m = "OpMetaRemoveXAttr"
case OpMetaListXAttr:
m = "OpMetaListXAttr"
case OpMetaBatchGetXAttr:
m = "OpMetaBatchGetXAttr"
case OpMetaUpdateXAttr:
m = "OpMetaUpdateXAttr"
case OpCreateMultipart:
m = "OpCreateMultipart"
case OpGetMultipart:
m = "OpGetMultipart"
case OpAddMultipartPart:
m = "OpAddMultipartPart"
case OpRemoveMultipart:
m = "OpRemoveMultipart"
case OpListMultiparts:
m = "OpListMultiparts"
case OpBatchDeleteExtent:
m = "OpBatchDeleteExtent"
case OpMetaClearInodeCache:
m = "OpMetaClearInodeCache"
case OpMetaTxCreateInode:
m = "OpMetaTxCreateInode"
case OpMetaTxCreateDentry:
m = "OpMetaTxCreateDentry"
case OpTxCommit:
m = "OpTxCommit"
case OpMetaTxCreate:
m = "OpMetaTxCreate"
case OpTxRollback:
m = "OpTxRollback"
case OpTxCommitRM:
m = "OpTxCommitRM"
case OpTxRollbackRM:
m = "OpTxRollbackRM"
case OpMetaTxDeleteDentry:
m = "OpMetaTxDeleteDentry"
case OpMetaTxUnlinkInode:
m = "OpMetaTxUnlinkInode"
case OpMetaTxUpdateDentry:
m = "OpMetaTxUpdateDentry"
case OpMetaTxLinkInode:
m = "OpMetaTxLinkInode"
case OpMetaTxGet:
m = "OpMetaTxGet"
case OpMetaBatchSetInodeQuota:
m = "OpMetaBatchSetInodeQuota"
case OpMetaBatchDeleteInodeQuota:
m = "OpMetaBatchDeleteInodeQuota"
case OpMetaGetInodeQuota:
m = "OpMetaGetInodeQuota"
case OpStopDataPartitionRepair:
m = "OpStopDataPartitionRepair"
case OpLcNodeHeartbeat:
m = "OpLcNodeHeartbeat"
case OpLcNodeScan:
m = "OpLcNodeScan"
case OpLcNodeSnapshotVerDel:
m = "OpLcNodeSnapshotVerDel"
case OpMetaReadDirOnly:
m = "OpMetaReadDirOnly"
default:
m = fmt.Sprintf("op:%v not found", p.Opcode)
}
return
}
func GetStatusStr(status uint8) string {
pkt := &Packet{}
pkt.ResultCode = status
return pkt.GetResultMsg()
}
// GetResultMsg returns the result message.
func (p *Packet) GetResultMsg() (m string) {
if p == nil {
return ""
}
switch p.ResultCode {
case OpConflictExtentsErr:
m = "ConflictExtentsErr"
case OpIntraGroupNetErr:
m = "IntraGroupNetErr"
case OpDiskNoSpaceErr:
m = "DiskNoSpaceErr"
case OpDiskErr:
m = "DiskErr"
case OpErr:
m = "Err: " + string(p.Data)
case OpAgain:
m = "Again: " + string(p.Data)
case OpOk:
m = "Ok"
case OpExistErr:
m = "ExistErr"
case OpInodeFullErr:
m = "InodeFullErr"
case OpArgMismatchErr:
m = "ArgUnmatchErr"
case OpNotExistErr:
m = "NotExistErr"
case OpTryOtherAddr:
m = "TryOtherAddr"
case OpNotPerm:
m = "NotPerm"
case OpNotEmpty:
m = "DirNotEmpty"
case OpDirQuota:
m = "OpDirQuota"
case OpNoSpaceErr:
m = "NoSpaceErr"
case OpTxInodeInfoNotExistErr:
m = "OpTxInodeInfoNotExistErr"
case OpTxConflictErr:
m = "TransactionConflict"
case OpTxDentryInfoNotExistErr:
m = "OpTxDentryInfoNotExistErr"
case OpTxRbInodeNotExistErr:
m = "OpTxRbInodeNotExistEr"
case OpTxRbDentryNotExistErr:
m = "OpTxRbDentryNotExistEr"
case OpTxInfoNotExistErr:
m = "OpTxInfoNotExistErr"
case OpTxInternalErr:
m = "OpTxInternalErr"
case OpTxCommitItemErr:
m = "OpTxCommitItemErr"
case OpTxRollbackItemErr:
m = "OpTxRollbackItemErr"
case OpTxRollbackUnknownRbType:
m = "OpTxRollbackUnknownRbType"
case OpTxTimeoutErr:
m = "OpTxTimeoutErr"
case OpTxSetStateErr:
m = "OpTxSetStateErr"
case OpTxCommitErr:
m = "OpTxCommitErr"
case OpTxRollbackErr:
m = "OpTxRollbackErr"
case OpUploadPartConflictErr:
m = "OpUploadPartConflictErr"
default:
return fmt.Sprintf("Unknown ResultCode(%v)", p.ResultCode)
}
return
}
func (p *Packet) GetReqID() int64 {
return p.ReqID
}
// MarshalHeader marshals the packet header.
func (p *Packet) MarshalHeader(out []byte) {
out[0] = p.Magic
out[1] = p.ExtentType
out[2] = p.Opcode
out[3] = p.ResultCode
out[4] = p.RemainingFollowers
binary.BigEndian.PutUint32(out[5:9], p.CRC)
binary.BigEndian.PutUint32(out[9:13], p.Size)
binary.BigEndian.PutUint32(out[13:17], p.ArgLen)
binary.BigEndian.PutUint64(out[17:25], p.PartitionID)
binary.BigEndian.PutUint64(out[25:33], p.ExtentID)
binary.BigEndian.PutUint64(out[33:41], uint64(p.ExtentOffset))
binary.BigEndian.PutUint64(out[41:49], uint64(p.ReqID))
binary.BigEndian.PutUint64(out[49:util.PacketHeaderSize], p.KernelOffset)
if p.Opcode == OpRandomWriteVer || p.ExtentType&MultiVersionFlag > 0 {
binary.BigEndian.PutUint64(out[util.PacketHeaderSize:util.PacketHeaderSize+8], p.VerSeq)
}
}
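// Illustrative round trip (a sketch added for clarity; it assumes the packet was built with
// NewPacket, does not use OpRandomWriteVer, and does not carry the multi-version flag, so the
// fixed-size header buffer is large enough):
//
//	out := make([]byte, util.PacketHeaderSize)
//	p.MarshalHeader(out)
//	q := NewPacket()
//	_ = q.UnmarshalHeader(out) // q now holds p's fixed header fields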
func (p *Packet) IsVersionList() bool {
return p.ExtentType&VersionListFlag == VersionListFlag
}
// UnmarshalHeader unmarshals the packet header.
func (p *Packet) UnmarshalHeader(in []byte) error {
p.Magic = in[0]
if p.Magic != ProtoMagic {
return errors.New("Bad Magic " + strconv.Itoa(int(p.Magic)))
}
p.ExtentType = in[1]
p.Opcode = in[2]
p.ResultCode = in[3]
p.RemainingFollowers = in[4]
p.CRC = binary.BigEndian.Uint32(in[5:9])
p.Size = binary.BigEndian.Uint32(in[9:13])
p.ArgLen = binary.BigEndian.Uint32(in[13:17])
p.PartitionID = binary.BigEndian.Uint64(in[17:25])
p.ExtentID = binary.BigEndian.Uint64(in[25:33])
p.ExtentOffset = int64(binary.BigEndian.Uint64(in[33:41]))
p.ReqID = int64(binary.BigEndian.Uint64(in[41:49]))
p.KernelOffset = binary.BigEndian.Uint64(in[49:util.PacketHeaderSize])
// The version field for OpRandomWriteVer is not unmarshaled here because the header size is fixed;
// the caller reads the version sequence separately at a higher level.
return nil
}
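// verInfoCnt is the wire size of one version entry as written by MarshalVersionSlice,
// presumably Ver (8 bytes) + DelTime (8 bytes) + Status (1 byte).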
const verInfoCnt = 17
func (p *Packet) MarshalVersionSlice() (data []byte, err error) {
items := p.VerList
cnt := len(items)
buff := bytes.NewBuffer(make([]byte, 0, 2*cnt*verInfoCnt))
if err := binary.Write(buff, binary.BigEndian, uint16(cnt)); err != nil {
return nil, err
}
for _, v := range items {
if err := binary.Write(buff, binary.BigEndian, v.Ver); err != nil {
return nil, err
}
if err := binary.Write(buff, binary.BigEndian, v.DelTime); err != nil {
return nil, err
}
if err := binary.Write(buff, binary.BigEndian, v.Status); err != nil {
return nil, err
}
}
return buff.Bytes(), nil
}
func (p *Packet) UnmarshalVersionSlice(cnt int, d []byte) error {
items := make([]*VolVersionInfo, 0)
buf := bytes.NewBuffer(d)
var err error
for idx := 0; idx < cnt; idx++ {
e := &VolVersionInfo{}
err = binary.Read(buf, binary.BigEndian, &e.Ver)
if err != nil {
return err
}
err = binary.Read(buf, binary.BigEndian, &e.DelTime)
if err != nil {
return err
}
err = binary.Read(buf, binary.BigEndian, &e.Status)
if err != nil {
return err
}
items = append(items, e)
}
p.VerList = items
return nil
}
// MarshalData marshals the packet data.
func (p *Packet) MarshalData(v interface{}) error {
data, err := json.Marshal(v)
if err == nil {
p.Data = data
p.Size = uint32(len(p.Data))
}
return err
}
// UnmarshalData unmarshals the packet data.
func (p *Packet) UnmarshalData(v interface{}) error {
return json.Unmarshal(p.Data, v)
}
// WriteToNoDeadLineConn writes through the connection without deadline.
func (p *Packet) WriteToNoDeadLineConn(c net.Conn) (err error) {
header, err := Buffers.Get(util.PacketHeaderSize)
if err != nil {
header = make([]byte, util.PacketHeaderSize)
}
defer Buffers.Put(header)
p.MarshalHeader(header)
if _, err = c.Write(header); err == nil {
if _, err = c.Write(p.Arg[:int(p.ArgLen)]); err == nil {
if p.Data != nil {
_, err = c.Write(p.Data[:p.Size])
}
}
}
return
}
// WriteToConn writes through the given connection.
func (p *Packet) WriteToConn(c net.Conn) (err error) {
headSize := util.PacketHeaderSize
if p.Opcode == OpRandomWriteVer || p.ExtentType&MultiVersionFlag > 0 {
headSize = util.PacketHeaderVerSize
}
// log.LogDebugf("packet opcode %v header size %v extentype %v conn %v", p.Opcode, headSize, p.ExtentType, c)
header, err := Buffers.Get(headSize)
if err != nil {
header = make([]byte, headSize)
}
// log.LogErrorf("action[WriteToConn] buffer get nil,opcode %v head len [%v]", p.Opcode, len(header))
defer Buffers.Put(header)
c.SetWriteDeadline(time.Now().Add(WriteDeadlineTime * time.Second))
p.MarshalHeader(header)
if _, err = c.Write(header); err == nil {
// write dir version info.
if p.IsVersionList() {
d, err1 := p.MarshalVersionSlice()
if err1 != nil {
log.LogErrorf("MarshalVersionSlice: marshal version ifo failed, err %s", err1.Error())
return err1
}
_, err = c.Write(d)
if err != nil {
return err
}
}
if _, err = c.Write(p.Arg[:int(p.ArgLen)]); err == nil {
if p.Data != nil && p.Size != 0 {
_, err = c.Write(p.Data[:p.Size])
}
}
}
return
}
// ReadFull is a wrapper function of io.ReadFull.
func ReadFull(c net.Conn, buf *[]byte, readSize int) (err error) {
*buf = make([]byte, readSize)
_, err = io.ReadFull(c, (*buf)[:readSize])
return
}
func (p *Packet) IsWriteOperation() bool {
return p.Opcode == OpWrite || p.Opcode == OpSyncWrite
}
func (p *Packet) IsReadOperation() bool {
return p.Opcode == OpStreamRead || p.Opcode == OpRead ||
p.Opcode == OpExtentRepairRead || p.Opcode == OpReadTinyDeleteRecord ||
p.Opcode == OpTinyExtentRepairRead || p.Opcode == OpStreamFollowerRead
}
// ReadFromConnWithVer reads the data from the given connection.
// It recognizes the version bit and parses out the version. To avoid echoing the version field
// back in the response, the datanode's reply to a random write replaces OpRandomWriteVer with OpRandomWriteVerRsp.
func (p *Packet) ReadFromConnWithVer(c net.Conn, timeoutSec int) (err error) {
if timeoutSec != NoReadDeadlineTime {
c.SetReadDeadline(time.Now().Add(time.Second * time.Duration(timeoutSec)))
} else {
c.SetReadDeadline(time.Time{})
}
header, err := Buffers.Get(util.PacketHeaderSize)
if err != nil {
header = make([]byte, util.PacketHeaderSize)
}
defer Buffers.Put(header)
var n int
if n, err = io.ReadFull(c, header); err != nil {
return
}
if n != util.PacketHeaderSize {
return syscall.EBADMSG
}
if err = p.UnmarshalHeader(header); err != nil {
return
}
if p.ExtentType&MultiVersionFlag > 0 {
ver := make([]byte, 8)
if _, err = io.ReadFull(c, ver); err != nil {
return
}
p.VerSeq = binary.BigEndian.Uint64(ver)
}
if p.IsVersionList() {
cntByte := make([]byte, 2)
if _, err = io.ReadFull(c, cntByte); err != nil {
return err
}
cnt := binary.BigEndian.Uint16(cntByte)
log.LogDebugf("action[ReadFromConnWithVer] op %s verseq %v, extType %d, cnt %d",
p.GetOpMsg(), p.VerSeq, p.ExtentType, cnt)
verData := make([]byte, cnt*verInfoCnt)
if _, err = io.ReadFull(c, verData); err != nil {
log.LogWarnf("ReadFromConnWithVer: read ver slice from conn failed, err %s", err.Error())
return err
}
err = p.UnmarshalVersionSlice(int(cnt), verData)
if err != nil {
log.LogWarnf("ReadFromConnWithVer: unmarshal ver slice failed, err %s", err.Error())
return err
}
}
if p.ArgLen > 0 {
p.Arg = make([]byte, int(p.ArgLen))
if _, err = io.ReadFull(c, p.Arg[:int(p.ArgLen)]); err != nil {
return err
}
}
size := p.Size
if p.IsReadOperation() && p.ResultCode == OpInitResultCode {
size = 0
}
if p.IsWriteOperation() && size == util.BlockSize {
p.Data, _ = Buffers.Get(int(size))
} else {
p.Data = make([]byte, size)
}
if n, err = io.ReadFull(c, p.Data[:size]); err != nil {
return err
}
if n != int(size) {
return syscall.EBADMSG
}
return nil
}
// ReadFromConn reads the data from the given connection.
func (p *Packet) ReadFromConn(c net.Conn, timeoutSec int) (err error) {
if timeoutSec != NoReadDeadlineTime {
c.SetReadDeadline(time.Now().Add(time.Second * time.Duration(timeoutSec)))
} else {
c.SetReadDeadline(time.Time{})
}
header, err := Buffers.Get(util.PacketHeaderSize)
if err != nil {
header = make([]byte, util.PacketHeaderSize)
}
defer Buffers.Put(header)
var n int
if n, err = io.ReadFull(c, header); err != nil {
return
}
if n != util.PacketHeaderSize {
return syscall.EBADMSG
}
if err = p.UnmarshalHeader(header); err != nil {
return
}
if p.ArgLen > 0 {
p.Arg = make([]byte, int(p.ArgLen))
if _, err = io.ReadFull(c, p.Arg[:int(p.ArgLen)]); err != nil {
return err
}
}
size := p.Size
if (p.Opcode == OpRead || p.Opcode == OpStreamRead || p.Opcode == OpExtentRepairRead || p.Opcode == OpStreamFollowerRead) && p.ResultCode == OpInitResultCode {
size = 0
}
p.Data = make([]byte, size)
if n, err = io.ReadFull(c, p.Data[:size]); err != nil {
return err
}
if n != int(size) {
return syscall.EBADMSG
}
return nil
}
// PacketOkReply sets the result code as OpOk, and sets the body as empty.
func (p *Packet) PacketOkReply() {
p.ResultCode = OpOk
p.Size = 0
p.Data = nil
p.ArgLen = 0
}
// PacketOkWithBody sets the result code as OpOk, and sets the body with the given data.
func (p *Packet) PacketOkWithBody(reply []byte) {
p.Size = uint32(len(reply))
p.Data = make([]byte, p.Size)
copy(p.Data[:p.Size], reply)
p.ResultCode = OpOk
p.ArgLen = 0
}
// PacketOkWithByte sets the result code as OpOk and reuses the given byte slice as the body without copying.
// Note: intended for temporary byte slices, e.g. JSON-marshaled data.
func (p *Packet) PacketOkWithByte(reply []byte) {
p.Size = uint32(len(reply))
p.Data = reply
p.ResultCode = OpOk
p.ArgLen = 0
}
// PacketErrorWithBody sets the packet with error code whose body is filled with the given data.
func (p *Packet) PacketErrorWithBody(code uint8, reply []byte) {
p.Size = uint32(len(reply))
p.Data = make([]byte, p.Size)
copy(p.Data[:p.Size], reply)
p.ResultCode = code
p.ArgLen = 0
}
func (p *Packet) SetPacketHasPrepare() {
p.setPacketPrefix()
p.HasPrepare = true
}
func (p *Packet) SetPacketRePrepare() {
p.HasPrepare = false
}
func (p *Packet) AddMesgLog(m string) {
p.mesg += m
}
// GetUniqueLogId returns the unique log ID.
func (p *Packet) GetUniqueLogId() (m string) {
defer func() {
m = m + fmt.Sprintf("_ResultMesg(%v)", p.GetResultMsg())
}()
if p.HasPrepare {
m = p.mesg
return
}
m = fmt.Sprintf("Req(%v)_Partition(%v)_", p.ReqID, p.PartitionID)
if (p.Opcode == OpSplitMarkDelete || (IsTinyExtentType(p.ExtentType) && p.Opcode == OpMarkDelete)) && len(p.Data) > 0 {
ext := new(TinyExtentDeleteRecord)
err := json.Unmarshal(p.Data, ext)
if err == nil {
m += fmt.Sprintf("Extent(%v)_ExtentOffset(%v)_Size(%v)_Opcode(%v)",
ext.ExtentId, ext.ExtentOffset, ext.Size, p.GetOpMsg())
return m
}
} else if p.Opcode == OpReadTinyDeleteRecord || p.Opcode == OpNotifyReplicasToRepair || p.Opcode == OpDataNodeHeartbeat ||
p.Opcode == OpLoadDataPartition || p.Opcode == OpBatchDeleteExtent {
m += fmt.Sprintf("Opcode(%v)", p.GetOpMsg())
return
} else if p.Opcode == OpBroadcastMinAppliedID || p.Opcode == OpGetAppliedId {
if p.Size > 0 {
applyID := binary.BigEndian.Uint64(p.Data)
m += fmt.Sprintf("Opcode(%v)_AppliedID(%v)", p.GetOpMsg(), applyID)
} else {
m += fmt.Sprintf("Opcode(%v)", p.GetOpMsg())
}
return m
}
m = fmt.Sprintf("Req(%v)_Partition(%v)_Extent(%v)_ExtentOffset(%v)_KernelOffset(%v)_"+
"Size(%v)_Opcode(%v)_CRC(%v)",
p.ReqID, p.PartitionID, p.ExtentID, p.ExtentOffset,
p.KernelOffset, p.Size, p.GetOpMsg(), p.CRC)
return
}
func (p *Packet) setPacketPrefix() {
p.mesg = fmt.Sprintf("Req(%v)_Partition(%v)_", p.ReqID, p.PartitionID)
if (p.Opcode == OpSplitMarkDelete || (IsTinyExtentType(p.ExtentType) && p.Opcode == OpMarkDelete)) && len(p.Data) > 0 {
ext := new(TinyExtentDeleteRecord)
err := json.Unmarshal(p.Data, ext)
if err == nil {
p.mesg += fmt.Sprintf("Extent(%v)_ExtentOffset(%v)_Size(%v)_Opcode(%v)",
ext.ExtentId, ext.ExtentOffset, ext.Size, p.GetOpMsg())
return
}
} else if p.Opcode == OpReadTinyDeleteRecord || p.Opcode == OpNotifyReplicasToRepair || p.Opcode == OpDataNodeHeartbeat ||
p.Opcode == OpLoadDataPartition || p.Opcode == OpBatchDeleteExtent {
p.mesg += fmt.Sprintf("Opcode(%v)", p.GetOpMsg())
return
} else if p.Opcode == OpBroadcastMinAppliedID || p.Opcode == OpGetAppliedId {
if p.Size > 0 {
applyID := binary.BigEndian.Uint64(p.Data)
p.mesg += fmt.Sprintf("Opcode(%v)_AppliedID(%v)", p.GetOpMsg(), applyID)
} else {
p.mesg += fmt.Sprintf("Opcode(%v)", p.GetOpMsg())
}
return
}
p.mesg = fmt.Sprintf("Req(%v)_Partition(%v)_Extent(%v)_ExtentOffset(%v)_KernelOffset(%v)_"+
"Size(%v)_Opcode(%v)_CRC(%v)",
p.ReqID, p.PartitionID, p.ExtentID, p.ExtentOffset,
p.KernelOffset, p.Size, p.GetOpMsg(), p.CRC)
}
// IsForwardPkt returns if the packet is the forward packet (a packet that will be forwarded to the followers).
func (p *Packet) IsForwardPkt() bool {
return p.RemainingFollowers > 0
}
// LogMessage logs the given message.
func (p *Packet) LogMessage(action, remote string, start int64, err error) (m string) {
if err == nil {
m = fmt.Sprintf("id[%v] isPrimaryBackReplLeader[%v] remote[%v] "+
" cost[%v] ", p.GetUniqueLogId(), p.IsForwardPkt(), remote, (time.Now().UnixNano()-start)/1e6)
} else {
m = fmt.Sprintf("id[%v] isPrimaryBackReplLeader[%v] remote[%v]"+
", err[%v]", p.GetUniqueLogId(), p.IsForwardPkt(), remote, err.Error())
}
return
}
// ShouldRetryWithVersionList returns whether the packet should be retried with the version list.
func (p *Packet) ShouldRetryWithVersionList() bool {
return p.ResultCode == OpAgainVerionList
}
// ShouldRetry returns whether the packet should be retried.
func (p *Packet) ShouldRetry() bool {
return p.ResultCode == OpAgain || p.ResultCode == OpErr
}
func (p *Packet) IsBatchDeleteExtents() bool {
return p.Opcode == OpBatchDeleteExtent
}
func InitBufferPool(bufLimit int64) {
buf.NormalBuffersTotalLimit = bufLimit
buf.HeadBuffersTotalLimit = bufLimit
buf.HeadVerBuffersTotalLimit = bufLimit
Buffers = buf.NewBufferPool()
}
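// Illustrative sketch (the limit below is an arbitrary example, not a recommendation): the buffer
// pool must be initialized once, before packets are read from or written to connections, e.g.
// during process start-up:
//
//	InitBufferPool(32768) // caps the head/normal/head-ver buffer pools (semantics defined by the buf package)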
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"path"
"regexp"
"strings"
)
var (
actionRegexp = regexp.MustCompile(`^action:((oss:(\w+))|(posix:(\w)+))$`)
actionPrefixRegexp = regexp.MustCompile(`^action:((oss)|(posix)):`)
)
type Action string
func (a Action) String() string {
return string(a)
}
func (a Action) IsNone() bool {
return len(a) == 0 || a == NoneAction
}
func (a Action) Name() string {
loc := actionPrefixRegexp.FindStringIndex(a.String())
if len(loc) != 2 {
return "Unknown"
}
return a.String()[loc[1]:]
}
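// For example (illustrative, based on the regexps above):
//
//	Action("action:oss:GetObject").Name() // "GetObject"
//	Action("oss:GetObject").Name()        // "Unknown" (missing the "action:" prefix)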
const (
ActionPrefix = "action:"
OSSActionPrefix = ActionPrefix + "oss:"
POSIXActionPrefix = ActionPrefix + "posix:"
// Object actions
OSSGetObjectAction Action = OSSActionPrefix + "GetObject"
OSSPutObjectAction Action = OSSActionPrefix + "PutObject"
OSSPostObjectAction Action = OSSActionPrefix + "PostObject"
OSSCopyObjectAction Action = OSSActionPrefix + "CopyObject"
OSSListObjectsAction Action = OSSActionPrefix + "ListObjects"
OSSDeleteObjectAction Action = OSSActionPrefix + "DeleteObject"
OSSDeleteObjectsAction Action = OSSActionPrefix + "DeleteObjects"
OSSHeadObjectAction Action = OSSActionPrefix + "HeadObject"
// Bucket actions
OSSCreateBucketAction Action = OSSActionPrefix + "CreateBucket"
OSSDeleteBucketAction Action = OSSActionPrefix + "DeleteBucket"
OSSHeadBucketAction Action = OSSActionPrefix + "HeadBucket"
OSSListBucketsAction Action = OSSActionPrefix + "ListBuckets"
// Bucket policy actions
OSSGetBucketPolicyAction Action = OSSActionPrefix + "GetBucketPolicy"
OSSPutBucketPolicyAction Action = OSSActionPrefix + "PutBucketPolicy"
OSSDeleteBucketPolicyAction Action = OSSActionPrefix + "DeleteBucketPolicy"
OSSGetBucketPolicyStatusAction Action = OSSActionPrefix + "GetBucketPolicyStatus" // unsupported
// Bucket ACL actions
OSSGetBucketAclAction Action = OSSActionPrefix + "GetBucketAcl"
OSSPutBucketAclAction Action = OSSActionPrefix + "PutBucketAcl"
// Bucket CORS actions
OSSGetBucketCorsAction Action = OSSActionPrefix + "GetBucketCors"
OSSPutBucketCorsAction Action = OSSActionPrefix + "PutBucketCors"
OSSDeleteBucketCorsAction Action = OSSActionPrefix + "DeleteBucketCors"
OSSOptionsObjectAction Action = OSSActionPrefix + "OptionsObject"
// Object torrent actions
OSSGetObjectTorrentAction Action = OSSActionPrefix + "GetObjectTorrent" // unsupported
// Object ACL actions
OSSGetObjectAclAction Action = OSSActionPrefix + "GetObjectAcl"
OSSPutObjectAclAction Action = OSSActionPrefix + "PutObjectAcl"
// Multipart actions
OSSCreateMultipartUploadAction Action = OSSActionPrefix + "CreateMultipartUpload"
OSSListMultipartUploadsAction Action = OSSActionPrefix + "ListMultipartUploads"
OSSUploadPartAction Action = OSSActionPrefix + "UploadPart"
OSSUploadPartCopyAction Action = OSSActionPrefix + "UploadPartCopy" // unsupported
OSSListPartsAction Action = OSSActionPrefix + "ListParts"
OSSCompleteMultipartUploadAction Action = OSSActionPrefix + "CompleteMultipartUpload"
OSSAbortMultipartUploadAction Action = OSSActionPrefix + "AbortMultipartUpload"
// Bucket location
OSSGetBucketLocationAction Action = OSSActionPrefix + "GetBucketLocation"
// Object extend attributes (xattr)
OSSGetObjectXAttrAction Action = OSSActionPrefix + "GetObjectXAttr"
OSSPutObjectXAttrAction Action = OSSActionPrefix + "PutObjectXAttr"
OSSListObjectXAttrsAction Action = OSSActionPrefix + "ListObjectXAttrs"
OSSDeleteObjectXAttrAction Action = OSSActionPrefix + "DeleteObjectXAttr"
// Object tagging actions
OSSGetObjectTaggingAction Action = OSSActionPrefix + "GetObjectTagging"
OSSPutObjectTaggingAction Action = OSSActionPrefix + "PutObjectTagging"
OSSDeleteObjectTaggingAction Action = OSSActionPrefix + "DeleteObjectTagging"
// Bucket tagging actions
OSSGetBucketTaggingAction Action = OSSActionPrefix + "GetBucketTagging"
OSSPutBucketTaggingAction Action = OSSActionPrefix + "PutBucketTagging"
OSSDeleteBucketTaggingAction Action = OSSActionPrefix + "DeleteBucketTagging"
// Bucket lifecycle actions
OSSGetBucketLifecycleAction Action = OSSActionPrefix + "GetBucketLifecycle" // unsupported
OSSPutBucketLifecycleAction Action = OSSActionPrefix + "PutBucketLifecycle" // unsupported
OSSDeleteBucketLifecycleAction Action = OSSActionPrefix + "DeleteBucketLifecycle" // unsupported
OSSGetBucketLifecycleConfigurationAction Action = OSSActionPrefix + "GetBucketLifecycleConfiguration"
OSSPutBucketLifecycleConfigurationAction Action = OSSActionPrefix + "PutBucketLifecycleConfiguration"
OSSDeleteBucketLifecycleConfigurationAction Action = OSSActionPrefix + "DeleteBucketLifecycleConfiguration"
// Object storage version actions
OSSGetBucketVersioningAction Action = OSSActionPrefix + "GetBucketVersioning" // unsupported
OSSPutBucketVersioningAction Action = OSSActionPrefix + "PutBucketVersioning" // unsupported
OSSListObjectVersionsAction Action = OSSActionPrefix + "ListObjectVersions" // unsupported
// Object legal hold actions
OSSGetObjectLegalHoldAction Action = OSSActionPrefix + "GetObjectLegalHold" // unsupported
OSSPutObjectLegalHoldAction Action = OSSActionPrefix + "PutObjectLegalHold" // unsupported
// Object retention actions
OSSGetObjectRetentionAction Action = OSSActionPrefix + "GetObjectRetention" // unsupported
OSSPutObjectRetentionAction Action = OSSActionPrefix + "PutObjectRetention" // unsupported
// Bucket encryption actions
OSSGetBucketEncryptionAction Action = OSSActionPrefix + "GetBucketEncryption" // unsupported
OSSPutBucketEncryptionAction Action = OSSActionPrefix + "PutBucketEncryption" // unsupported
OSSDeleteBucketEncryptionAction Action = OSSActionPrefix + "DeleteBucketEncryption" // unsupported
// Bucket website actions
OSSGetBucketWebsiteAction Action = OSSActionPrefix + "GetBucketWebsite" // unsupported
OSSPutBucketWebsiteAction Action = OSSActionPrefix + "PutBucketWebsite" // unsupported
OSSDeleteBucketWebsiteAction Action = OSSActionPrefix + "DeleteBucketWebsite" // unsupported
// Object restore actions
OSSRestoreObjectAction Action = OSSActionPrefix + "RestoreObject" // unsupported
// Public access block actions
OSSGetPublicAccessBlockAction Action = OSSActionPrefix + "GetPublicAccessBlock" // unsupported
OSSPutPublicAccessBlockAction Action = OSSActionPrefix + "PutPublicAccessBlock" // unsupported
OSSDeletePublicAccessBlockAction Action = OSSActionPrefix + "DeletePulicAccessBlock" // unsupported
// Bucket request payment actions
OSSGetBucketRequestPaymentAction Action = OSSActionPrefix + "GetBucketRequestPayment" // unsupported
OSSPutBucketRequestPaymentAction Action = OSSActionPrefix + "PutBucketRequestPayment" // unsupported
// Bucket replication actions
OSSGetBucketReplicationAction Action = OSSActionPrefix + "GetBucketReplicationAction" // unsupported
OSSPutBucketReplicationAction Action = OSSActionPrefix + "PutBucketReplicationAction" // unsupported
OSSDeleteBucketReplicationAction Action = OSSActionPrefix + "DeleteBucketReplicationAction" // unsupported
// STS actions
OSSGetFederationTokenAction Action = OSSActionPrefix + "GetFederationToken"
// constants for POSIX file system interface
POSIXReadAction Action = POSIXActionPrefix + "Read"
POSIXWriteAction Action = POSIXActionPrefix + "Write"
// Object Lock actions
OSSPutObjectLockConfigurationAction Action = OSSActionPrefix + "PutObjectLockConfiguration"
OSSGetObjectLockConfigurationAction Action = OSSActionPrefix + "GetObjectLockConfiguration"
NoneAction Action = ""
)
var AllActions = []Action{
// Object storage interface actions
OSSGetObjectAction,
OSSPutObjectAction,
OSSPostObjectAction,
OSSCopyObjectAction,
OSSListObjectsAction,
OSSDeleteObjectAction,
OSSDeleteObjectsAction,
OSSHeadObjectAction,
OSSCreateBucketAction,
OSSDeleteBucketAction,
OSSHeadBucketAction,
OSSListBucketsAction,
OSSGetBucketPolicyAction,
OSSPutBucketPolicyAction,
OSSDeleteBucketPolicyAction,
OSSGetBucketPolicyStatusAction,
OSSGetBucketAclAction,
OSSPutBucketAclAction,
OSSGetObjectTorrentAction,
OSSGetObjectAclAction,
OSSPutObjectAclAction,
OSSCreateMultipartUploadAction,
OSSListMultipartUploadsAction,
OSSUploadPartAction,
OSSUploadPartCopyAction,
OSSListPartsAction,
OSSCompleteMultipartUploadAction,
OSSAbortMultipartUploadAction,
OSSGetBucketLocationAction,
OSSGetObjectXAttrAction,
OSSPutObjectXAttrAction,
OSSListObjectXAttrsAction,
OSSDeleteObjectXAttrAction,
OSSGetObjectTaggingAction,
OSSPutObjectTaggingAction,
OSSDeleteObjectTaggingAction,
OSSGetBucketTaggingAction,
OSSPutBucketTaggingAction,
OSSDeleteBucketTaggingAction,
OSSGetBucketLifecycleAction,
OSSPutBucketLifecycleAction,
OSSDeleteBucketLifecycleAction,
OSSGetBucketLifecycleConfigurationAction,
OSSPutBucketLifecycleConfigurationAction,
OSSDeleteBucketLifecycleConfigurationAction,
OSSGetBucketVersioningAction,
OSSPutBucketVersioningAction,
OSSListObjectVersionsAction,
OSSGetObjectLegalHoldAction,
OSSPutObjectLegalHoldAction,
OSSGetObjectRetentionAction,
OSSPutObjectRetentionAction,
OSSGetBucketEncryptionAction,
OSSPutBucketEncryptionAction,
OSSDeleteBucketEncryptionAction,
OSSGetBucketCorsAction,
OSSPutBucketCorsAction,
OSSDeleteBucketCorsAction,
OSSGetBucketWebsiteAction,
OSSPutBucketWebsiteAction,
OSSDeleteBucketWebsiteAction,
OSSRestoreObjectAction,
OSSGetPublicAccessBlockAction,
OSSPutPublicAccessBlockAction,
OSSDeletePublicAccessBlockAction,
OSSGetBucketRequestPaymentAction,
OSSPutBucketRequestPaymentAction,
OSSGetBucketReplicationAction,
OSSPutBucketReplicationAction,
OSSDeleteBucketReplicationAction,
OSSOptionsObjectAction,
OSSGetFederationTokenAction,
// POSIX file system interface actions
POSIXReadAction,
POSIXWriteAction,
OSSPutObjectLockConfigurationAction,
OSSGetObjectLockConfigurationAction,
}
func ParseAction(str string) Action {
if len(str) == 0 || !actionRegexp.MatchString(str) {
return NoneAction
}
for _, act := range AllActions {
if act.String() == str {
return act
}
}
return NoneAction
}
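// For example (illustrative): only fully qualified, known action strings parse.
//
//	ParseAction("action:oss:PutObject") // OSSPutObjectAction
//	ParseAction("PutObject")            // NoneAction (the "action:oss:"/"action:posix:" prefix is required)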
type Actions []Action
func (actions Actions) Contains(action Action) bool {
if len(actions) == 0 {
return false
}
for _, a := range actions {
if a == action {
return true
}
}
return false
}
func (actions Actions) Len() int {
return len(actions)
}
type Permission string
func (p Permission) String() string {
return string(p)
}
func (p Permission) ReadableString() string {
if p.Valid() {
if p.IsBuiltin() {
return p.String()[len(BuiltinPermissionPrefix.String()):] + "(builtin)"
}
if p.IsCustom() {
return p.String()[len(CustomPermissionPrefix.String()):] + "(custom)"
}
return p.String()
}
return "None"
}
func (p Permission) IsBuiltin() bool {
return builtinPermRegexp.MatchString(string(p))
}
func (p Permission) MatchSubdir(subdir string) bool {
if !strings.HasPrefix(string(p), string(BuiltinPermissionPrefix)) {
return false
}
s := strings.TrimPrefix(string(p), string(BuiltinPermissionPrefix))
if !subdirRegexp.MatchString(s) {
return true
}
pars := strings.Split(s, ":")
pars = pars[:len(pars)-1] // trim (Writable|ReadOnly) at the end
for _, toCmp := range pars {
if toCmp == "/" || toCmp == "" {
return true
}
subdir = path.Clean("/" + subdir)
toCmp = path.Clean("/" + toCmp)
if strings.HasPrefix(subdir, toCmp) {
tail := strings.TrimPrefix(subdir, toCmp)
// match case 1:
// subdir = "/a/b/c"
// toCmp = "/a/b/c"
// tail = ""
// match case 2:
// subdir = "/a/b/c"
// toCmp = "/a/b"
// tail = "/c"
if tail == "" || strings.HasPrefix(tail, "/") {
return true
}
}
}
return false
}
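// For example (illustrative): a permission scoped to /a/b matches /a/b itself and anything
// below it, but not the sibling /a/bc.
//
//	Permission("perm:builtin:/a/b:ReadOnly").MatchSubdir("a/b/c") // true
//	Permission("perm:builtin:/a/b:ReadOnly").MatchSubdir("a/bc")  // false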
func (p Permission) IsCustom() bool {
return customPermRegexp.MatchString(string(p))
}
func (p Permission) Valid() bool {
return permRegexp.MatchString(string(p))
}
func (p Permission) IsNone() bool {
return p == NonePermission
}
const (
// prefixes for value organization
PermissionPrefix Permission = "perm:"
BuiltinPermissionPrefix Permission = PermissionPrefix + "builtin:"
CustomPermissionPrefix Permission = PermissionPrefix + "custom:"
// constants for builtin permissions
BuiltinPermissionReadOnly Permission = BuiltinPermissionPrefix + "ReadOnly"
BuiltinPermissionWritable Permission = BuiltinPermissionPrefix + "Writable"
// constants for unknown permission
NonePermission Permission = ""
)
var (
permRegexp = regexp.MustCompile(`^perm:((builtin:((.*/*)([^/]*):*)(Writable|ReadOnly))|(custom:(\w)+))$`)
builtinPermRegexp = regexp.MustCompile(`^perm:builtin:((.*/*)([^/]*):*)(Writable|ReadOnly)$`)
builtinWritablePermRegexp = regexp.MustCompile(`^perm:builtin:((.*/*)([^/]*):*)Writable$`)
builtinReadOnlyPermRegexp = regexp.MustCompile(`^perm:builtin:((.*/*)([^/]*):*)ReadOnly$`)
customPermRegexp = regexp.MustCompile(`^perm:custom:(\w)+$`)
subdirRegexp = regexp.MustCompile(`((.*/*)([^/]*)):(Writable|ReadOnly)$`)
)
func ParsePermission(value string) Permission {
if permRegexp.MatchString(value) {
return Permission(value)
}
return NonePermission
}
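// For example (illustrative):
//
//	ParsePermission("perm:builtin:ReadOnly") // BuiltinPermissionReadOnly
//	ParsePermission("builtin:ReadOnly")      // NonePermission (the "perm:" prefix is required)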
func NewCustomPermission(name string) Permission {
return Permission(CustomPermissionPrefix + Permission(name))
}
var builtinPermissionActionsMap = map[Permission]Actions{
BuiltinPermissionReadOnly: {
// Object storage interface actions
OSSGetObjectAction,
OSSListObjectsAction,
OSSHeadObjectAction,
OSSHeadBucketAction,
OSSGetObjectTorrentAction,
OSSGetObjectAclAction,
OSSListPartsAction,
OSSGetBucketLocationAction,
OSSGetObjectTaggingAction,
OSSListObjectVersionsAction,
OSSGetObjectLegalHoldAction,
OSSGetObjectRetentionAction,
OSSGetBucketEncryptionAction,
// file system interface
POSIXReadAction,
},
BuiltinPermissionWritable: {
// Object storage interface actions
OSSGetObjectAction,
OSSPutObjectAction,
OSSCopyObjectAction,
OSSListObjectsAction,
OSSDeleteObjectAction,
OSSDeleteObjectsAction,
OSSHeadObjectAction,
OSSHeadBucketAction,
OSSGetObjectTorrentAction,
OSSGetObjectAclAction,
OSSPutObjectAclAction,
OSSCreateMultipartUploadAction,
OSSListMultipartUploadsAction,
OSSUploadPartAction,
OSSUploadPartCopyAction,
OSSListPartsAction,
OSSCompleteMultipartUploadAction,
OSSAbortMultipartUploadAction,
OSSGetBucketLocationAction,
OSSGetObjectTaggingAction,
OSSPutObjectTaggingAction,
OSSDeleteObjectTaggingAction,
OSSListObjectVersionsAction,
OSSGetObjectLegalHoldAction,
OSSPutObjectLegalHoldAction,
OSSGetObjectRetentionAction,
OSSPutObjectRetentionAction,
OSSGetBucketEncryptionAction,
// POSIX file system interface actions
POSIXReadAction,
POSIXWriteAction,
},
}
func BuiltinPermissionActions(perm Permission) Actions {
var p Permission
if builtinWritablePermRegexp.MatchString(string(perm)) {
p = BuiltinPermissionWritable
} else if builtinReadOnlyPermRegexp.MatchString(string(perm)) {
p = BuiltinPermissionReadOnly
}
if actions, exists := builtinPermissionActionsMap[p]; exists {
return actions
}
return nil
}
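// For example (illustrative): a builtin permission resolves to the action set of its ReadOnly
// or Writable base, regardless of any embedded subdir.
//
//	BuiltinPermissionActions("perm:builtin:/a/b:ReadOnly").Contains(POSIXReadAction)  // true
//	BuiltinPermissionActions("perm:builtin:/a/b:ReadOnly").Contains(POSIXWriteAction) // false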
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import "strings"
const (
FlowLimit = "f"
QPSLimit = "q"
ConcurrentLimit = "c"
S3Nodes = "s3nodes"
DefaultUid = "default"
)
type UserLimitConf struct {
BandWidthQuota map[string]uint64 `json:"band_width_quota"` // uid --> BytesPS
QPSQuota map[string]uint64 `json:"qps_quota"` // uid --> QPS
ConcurrentQuota map[string]uint64 `json:"concurrent_quota"` // uid --> concurrency
}
type S3QosRequest struct {
Uid string `json:"uid"`
Api string `json:"api"`
Type string `json:"type"`
Quota uint64 `json:"quota"`
Nodes uint64 `json:"nodes"`
}
type S3QoSResponse struct {
ApiLimitConf map[string]*UserLimitConf `json:"user_limit_conf"` // api --> userLimitConf
Nodes uint64 `json:"nodes"`
}
func IsS3PutApi(api string) bool {
switch strings.ToLower(api) {
case "putobject", "copyobject", "uploadpart", "uploadpartcopy", "postobject":
return true
default:
return false
}
}
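// For example (illustrative): the check is case-insensitive and only covers write-style APIs.
//
//	IsS3PutApi("PutObject") // true
//	IsS3PutApi("GetObject") // false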
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"bytes"
"encoding/binary"
"encoding/json"
"errors"
"io"
"strconv"
"strings"
"time"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
const (
DefaultTransactionTimeout = 1 // minutes
MaxTransactionTimeout = 60 // minutes
DefaultTxConflictRetryNum = 10
MaxTxConflictRetryNum = 100
DefaultTxConflictRetryInterval = 20 // ms
MaxTxConflictRetryInterval = 1000 // ms
MinTxConflictRetryInterval = 10 // ms
DefaultTxDeleteTime = 120
ClearOrphanTxTime = 3600
)
type TxOpMask uint8
const (
TxOpMaskOff TxOpMask = 0x00
TxOpMaskAll TxOpMask = 0x7F
TxPause TxOpMask = 0xFF
)
const (
TxOpMaskCreate TxOpMask = 0x01 << iota
TxOpMaskMkdir
TxOpMaskRemove
TxOpMaskRename
TxOpMaskMknod
TxOpMaskSymlink
TxOpMaskLink
)
var GTxMaskMap = map[string]TxOpMask{
"off": TxOpMaskOff,
"create": TxOpMaskCreate,
"mkdir": TxOpMaskMkdir,
"remove": TxOpMaskRemove,
"rename": TxOpMaskRename,
"mknod": TxOpMaskMknod,
"symlink": TxOpMaskSymlink,
"link": TxOpMaskLink,
"all": TxOpMaskAll,
}
func GetMaskString(mask TxOpMask) (maskStr string) {
if mask == TxPause {
return "pause"
}
if mask&TxOpMaskAll == TxOpMaskAll {
return "all"
}
for k, v := range GTxMaskMap {
if k == "all" {
continue
}
if mask&v > 0 {
if maskStr == "" {
maskStr = k
} else {
maskStr = maskStr + "|" + k
}
}
}
if maskStr == "" {
maskStr = "off"
}
return
}
func txInvalidMask() (err error) {
return errors.New("transaction mask key value pair should be: enableTxMaskKey=[create|mkdir|remove|rename|mknod|symlink|link]\n enableTxMaskKey=off \n enableTxMaskKey=all")
}
func MaskContains(mask TxOpMask, subMask TxOpMask) bool {
if mask != TxOpMaskOff && subMask == TxOpMaskOff {
return false
}
if (mask | subMask) != mask {
return false
}
return true
}
func GetMaskFromString(maskStr string) (mask TxOpMask, err error) {
if maskStr == "" {
err = txInvalidMask()
return
}
if maskStr == "pause" {
mask = TxPause
return
}
arr := strings.Split(maskStr, "|")
optNum := len(arr)
for _, v := range arr {
if m, ok := GTxMaskMap[v]; ok {
if optNum >= 2 && (m == TxOpMaskOff || m == TxOpMaskAll) {
mask = TxOpMaskOff
err = txInvalidMask()
return
} else {
mask = mask | m
}
} else {
mask = TxOpMaskOff
err = txInvalidMask()
return
}
}
return mask, nil
}
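// For example (illustrative): individual masks combine with "|" and each bit can be tested
// with MaskContains.
//
//	m, _ := GetMaskFromString("create|rename")
//	MaskContains(m, TxOpMaskCreate) // true
//	MaskContains(m, TxOpMaskRemove) // false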
type TxInodeInfo struct {
Ino uint64
MpID uint64
CreateTime int64 // time.Now().Unix()
Timeout int64
TxID string
MpMembers string
}
func NewTxInodeInfo(members string, ino uint64, mpID uint64) *TxInodeInfo {
return &TxInodeInfo{
Ino: ino,
MpID: mpID,
MpMembers: members,
}
}
func (info *TxInodeInfo) String() string {
data, err := json.Marshal(info)
if err != nil {
return ""
}
return string(data)
}
func (info *TxInodeInfo) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 128))
if err = binary.Write(buff, binary.BigEndian, &info.Ino); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &info.MpID); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &info.CreateTime); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &info.Timeout); err != nil {
return nil, err
}
id := []byte(info.TxID)
idSize := uint32(len(id))
if err = binary.Write(buff, binary.BigEndian, &idSize); err != nil {
return nil, err
}
if _, err = buff.Write(id); err != nil {
return nil, err
}
addr := []byte(info.MpMembers)
addrSize := uint32(len(addr))
if err = binary.Write(buff, binary.BigEndian, &addrSize); err != nil {
return nil, err
}
if _, err = buff.Write(addr); err != nil {
return nil, err
}
result = buff.Bytes()
return
}
func (info *TxInodeInfo) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &info.Ino); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.MpID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.CreateTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.Timeout); err != nil {
return
}
idSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &idSize); err != nil {
return
}
if idSize > 0 {
id := make([]byte, idSize)
if _, err = io.ReadFull(buff, id); err != nil {
return
}
info.TxID = string(id)
}
addrSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &addrSize); err != nil {
return
}
if addrSize > 0 {
addr := make([]byte, addrSize)
if _, err = io.ReadFull(buff, addr); err != nil {
return
}
info.MpMembers = string(addr)
}
return
}
func (info *TxInodeInfo) GetKey() uint64 {
return info.Ino
}
func (info *TxInodeInfo) SetTxId(txID string) {
info.TxID = txID
}
func (info *TxInodeInfo) SetTimeout(timeout int64) {
info.Timeout = timeout
}
func (info *TxInodeInfo) SetCreateTime(createTime int64) {
info.CreateTime = createTime
}
type TxDentryInfo struct {
ParentId uint64 // FileID value of the parent inode.
Name string // Name of the current dentry.
MpMembers string
TxID string
MpID uint64
CreateTime int64 // time.Now().Unix()
Timeout int64
}
func NewTxDentryInfo(members string, parentId uint64, name string, mpID uint64) *TxDentryInfo {
return &TxDentryInfo{
ParentId: parentId,
Name: name,
MpMembers: members,
MpID: mpID,
}
}
func (info *TxDentryInfo) String() string {
data, err := json.Marshal(info)
if err != nil {
return ""
}
return string(data)
}
func (info *TxDentryInfo) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 128))
if err = binary.Write(buff, binary.BigEndian, &info.ParentId); err != nil {
panic(err)
}
name := []byte(info.Name)
nameSize := uint32(len(name))
if err = binary.Write(buff, binary.BigEndian, &nameSize); err != nil {
panic(err)
}
if _, err = buff.Write(name); err != nil {
panic(err)
}
addr := []byte(info.MpMembers)
addrSize := uint32(len(addr))
if err = binary.Write(buff, binary.BigEndian, &addrSize); err != nil {
panic(err)
}
if _, err = buff.Write(addr); err != nil {
panic(err)
}
id := []byte(info.TxID)
idSize := uint32(len(id))
if err = binary.Write(buff, binary.BigEndian, &idSize); err != nil {
panic(err)
}
if _, err = buff.Write(id); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &info.MpID); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &info.CreateTime); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &info.Timeout); err != nil {
panic(err)
}
result = buff.Bytes()
return
}
func (info *TxDentryInfo) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &info.ParentId); err != nil {
return
}
nameSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &nameSize); err != nil {
return
}
if nameSize > 0 {
name := make([]byte, nameSize)
if _, err = io.ReadFull(buff, name); err != nil {
return
}
info.Name = string(name)
}
addrSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &addrSize); err != nil {
return
}
if addrSize > 0 {
addr := make([]byte, addrSize)
if _, err = io.ReadFull(buff, addr); err != nil {
return
}
info.MpMembers = string(addr)
}
idSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &idSize); err != nil {
return
}
if idSize > 0 {
id := make([]byte, idSize)
if _, err = io.ReadFull(buff, id); err != nil {
return
}
info.TxID = string(id)
}
if err = binary.Read(buff, binary.BigEndian, &info.MpID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.CreateTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.Timeout); err != nil {
return
}
return
}
func (info *TxDentryInfo) GetKey() string {
return strconv.FormatUint(info.ParentId, 10) + "_" + info.Name
}
func (info *TxDentryInfo) GetTxId() (string, error) {
if info.TxID == "" {
return "", errors.New("txID is not set")
}
return info.TxID, nil
}
func (info *TxDentryInfo) SetTxId(txID string) {
info.TxID = txID
}
func (info *TxDentryInfo) SetTimeout(timeout int64) {
info.Timeout = timeout
}
func (info *TxDentryInfo) SetCreateTime(createTime int64) {
info.CreateTime = createTime
}
const (
TxTypeUndefined uint32 = iota
TxTypeCreate
TxTypeMkdir
TxTypeRemove
TxTypeRename
TxTypeMknod
TxTypeSymlink
TxTypeLink
)
func TxMaskToType(mask TxOpMask) (txType uint32) {
switch mask {
case TxOpMaskOff:
txType = TxTypeUndefined
case TxOpMaskCreate:
txType = TxTypeCreate
case TxOpMaskMkdir:
txType = TxTypeMkdir
case TxOpMaskRemove:
txType = TxTypeRemove
case TxOpMaskRename:
txType = TxTypeRename
case TxOpMaskMknod:
txType = TxTypeMknod
case TxOpMaskSymlink:
txType = TxTypeSymlink
case TxOpMaskLink:
txType = TxTypeLink
default:
txType = TxTypeUndefined
}
return txType
}
const (
TxStateInit int32 = iota
TxStatePreCommit
TxStateCommit
TxStateRollback
TxStateCommitDone
TxStateRollbackDone
TxStateFailed
)
type TransactionInfo struct {
TxID string // "metapartitionId_atomicId", if empty, mp should be TM, otherwise it will be RM
TxType uint32
TmID int64
CreateTime int64 // time.Now()
Timeout int64 // minutes
State int32
DoneTime int64 // time.Now().Unix()
RMFinish bool // used to check whether the tx succeeded on the target RM.
// once inserted into the txTree, the inode & dentry info must not change
TxInodeInfos map[uint64]*TxInodeInfo
TxDentryInfos map[string]*TxDentryInfo
LastCheckTime int64
}
type TxMpInfo struct {
MpId uint64
Members string
TxInodeInfos map[uint64]*TxInodeInfo
TxDentryInfos map[string]*TxDentryInfo
}
const InitInode = 0
func (tx *TransactionInfo) SetCreateInodeId(ino uint64) {
inoIfo := tx.TxInodeInfos[InitInode]
inoIfo.Ino = ino
delete(tx.TxInodeInfos, InitInode)
tx.TxInodeInfos[ino] = inoIfo
}
func (tx *TransactionInfo) GroupByMp() map[uint64]*TxMpInfo {
txMap := make(map[uint64]*TxMpInfo)
for k, ifo := range tx.TxInodeInfos {
mpIfo, ok := txMap[ifo.MpID]
if !ok {
mpIfo = &TxMpInfo{
MpId: ifo.MpID,
Members: ifo.MpMembers,
TxInodeInfos: make(map[uint64]*TxInodeInfo),
TxDentryInfos: make(map[string]*TxDentryInfo),
}
txMap[ifo.MpID] = mpIfo
}
mpIfo.TxInodeInfos[k] = ifo
}
for k, ifo := range tx.TxDentryInfos {
mpIfo, ok := txMap[ifo.MpID]
if !ok {
mpIfo = &TxMpInfo{
MpId: ifo.MpID,
Members: ifo.MpMembers,
TxInodeInfos: make(map[uint64]*TxInodeInfo),
TxDentryInfos: make(map[string]*TxDentryInfo),
}
txMap[ifo.MpID] = mpIfo
}
mpIfo.TxDentryInfos[k] = ifo
}
return txMap
}
func (tx *TransactionInfo) IsDone() bool {
return tx.State == TxStateCommitDone || tx.State == TxStateRollbackDone
}
func (tx *TransactionInfo) CanDelete() bool {
if !tx.Finish() {
return false
}
if tx.DoneTime+DefaultTxDeleteTime < time.Now().Unix() {
return true
}
return false
}
func (tx *TransactionInfo) NeedClearOrphan() bool {
if tx.Finish() {
return false
}
now := time.Now().Unix()
if tx.CreateTime+ClearOrphanTxTime > now {
return false
}
// check at most once per minute to avoid sending too many requests
if now-tx.LastCheckTime < 60 {
return false
}
tx.LastCheckTime = now
return true
}
func (tx *TransactionInfo) Finish() bool {
return tx.RMFinish
}
func (tx *TransactionInfo) SetFinish() {
tx.RMFinish = true
tx.DoneTime = time.Now().Unix()
}
func (txInfo *TransactionInfo) GetInfo() string {
return txInfo.String()
}
func (txInfo *TransactionInfo) IsExpired() (expired bool) {
now := time.Now().Unix()
expired = txInfo.Timeout*60+txInfo.CreateTime < now
if expired {
log.LogWarnf("IsExpired: transaction [%v] is expired, now[%v], CreateTime[%v]", txInfo, now, txInfo.CreateTime)
}
return expired
}
// Less tests whether the current TransactionInfo item is less than the given one.
// This method is necessary for the B-Tree item implementation.
func (txInfo *TransactionInfo) Less(than btree.Item) bool {
ti, ok := than.(*TransactionInfo)
return ok && txInfo.TxID < ti.TxID
}
// Copy returns a copy of the transaction info.
func (txInfo *TransactionInfo) Copy() btree.Item {
return txInfo.GetCopy()
}
func NewTxInfoBItem(txId string) *TransactionInfo {
return &TransactionInfo{
TxID: txId,
}
}
const initTmId = -1
func NewTransactionInfo(timeout int64, txType uint32) *TransactionInfo {
return &TransactionInfo{
Timeout: timeout,
TxInodeInfos: make(map[uint64]*TxInodeInfo),
TxDentryInfos: make(map[string]*TxDentryInfo),
TmID: initTmId,
TxType: txType,
State: TxStateInit,
}
}
func (txInfo *TransactionInfo) IsInitialized() bool {
return txInfo.TxID != ""
}
func (txInfo *TransactionInfo) String() string {
data, err := json.Marshal(txInfo)
if err != nil {
return ""
}
return string(data)
}
func (txInfo *TransactionInfo) GetCopy() *TransactionInfo {
newInfo := *txInfo
return &newInfo
}
func (txInfo *TransactionInfo) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 256))
id := []byte(txInfo.TxID)
idSize := uint32(len(id))
if err = binary.Write(buff, binary.BigEndian, &idSize); err != nil {
return nil, err
}
if _, err = buff.Write(id); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.TxType); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.TmID); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.CreateTime); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.Timeout); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.State); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.DoneTime); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.RMFinish); err != nil {
return nil, err
}
inodeNum := uint32(len(txInfo.TxInodeInfos))
if err = binary.Write(buff, binary.BigEndian, &inodeNum); err != nil {
return nil, err
}
for _, txInodeInfo := range txInfo.TxInodeInfos {
bs, err := txInodeInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
}
dentryNum := uint32(len(txInfo.TxDentryInfos))
if err = binary.Write(buff, binary.BigEndian, &dentryNum); err != nil {
return nil, err
}
for _, txDentryInfo := range txInfo.TxDentryInfos {
bs, err := txDentryInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
}
return buff.Bytes(), nil
}
func (txInfo *TransactionInfo) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
idSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &idSize); err != nil {
return
}
if idSize > 0 {
id := make([]byte, idSize)
if _, err = io.ReadFull(buff, id); err != nil {
return
}
txInfo.TxID = string(id)
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.TxType); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.TmID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.CreateTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.Timeout); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.State); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.DoneTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.RMFinish); err != nil {
return
}
var inodeNum uint32
if err = binary.Read(buff, binary.BigEndian, &inodeNum); err != nil {
return
}
var dataLen uint32
txInfo.TxInodeInfos = map[uint64]*TxInodeInfo{}
for i := uint32(0); i < inodeNum; i++ {
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInodeInfo := NewTxInodeInfo("", 0, 0)
if err = txInodeInfo.Unmarshal(data); err != nil {
return
}
txInfo.TxInodeInfos[txInodeInfo.GetKey()] = txInodeInfo
}
var dentryNum uint32
txInfo.TxDentryInfos = map[string]*TxDentryInfo{}
if err = binary.Read(buff, binary.BigEndian, &dentryNum); err != nil {
return
}
for i := uint32(0); i < dentryNum; i++ {
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txDentryInfo := NewTxDentryInfo("", 0, "", 0)
if err = txDentryInfo.Unmarshal(data); err != nil {
return
}
txInfo.TxDentryInfos[txDentryInfo.GetKey()] = txDentryInfo
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"regexp"
"sync"
)
var (
AKRegexp = regexp.MustCompile("^[a-zA-Z0-9]{16}$")
SKRegexp = regexp.MustCompile("^[a-zA-Z0-9]{32}$")
WriteS3Api = []string{
"PostObject", "PutObject", "CopyObject", "CreateMultipartUpload", "UploadPart", "UploadPartCopy",
"CompleteMultipartUpload", "AbortMultipartUpload", "DeleteObjects", "DeleteObject",
}
)
type UserType uint8
const (
UserTypeInvalid UserType = 0x0
UserTypeRoot UserType = 0x1
UserTypeAdmin UserType = 0x2
UserTypeNormal UserType = 0x3
)
func (u UserType) Valid() bool {
switch u {
case UserTypeRoot,
UserTypeAdmin,
UserTypeNormal:
return true
default:
}
return false
}
func (u UserType) String() string {
switch u {
case UserTypeRoot:
return "root"
case UserTypeAdmin:
return "admin"
case UserTypeNormal:
return "normal"
default:
}
return "invalid"
}
func UserTypeFromString(name string) UserType {
switch name {
case "root":
return UserTypeRoot
case "admin":
return UserTypeAdmin
case "normal":
return UserTypeNormal
default:
}
return UserTypeInvalid
}
func IsValidAK(ak string) bool {
if AKRegexp.MatchString(ak) {
return true
} else {
return false
}
}
func IsValidSK(sk string) bool {
if SKRegexp.MatchString(sk) {
return true
} else {
return false
}
}
type AKUser struct {
AccessKey string `json:"access_key" graphql:"access_key"`
UserID string `json:"user_id" graphql:"user_id"`
Password string `json:"password" graphql:"password"`
}
type UserInfo struct {
UserID string `json:"user_id" graphql:"user_id"`
AccessKey string `json:"access_key" graphql:"access_key"`
SecretKey string `json:"secret_key" graphql:"secret_key"`
Policy *UserPolicy `json:"policy" graphql:"policy"`
UserType UserType `json:"user_type" graphql:"user_type"`
CreateTime string `json:"create_time" graphql:"create_time"`
Description string `json:"description" graphql:"description"`
Mu sync.RWMutex `json:"-" graphql:"-"`
	EMPTY bool // TODO: is this field still required by graphql?
}
func (i *UserInfo) String() string {
if i == nil {
return "nil"
}
return fmt.Sprintf("%v_%v_%v_%v",
i.UserID, i.AccessKey, i.SecretKey, i.UserType)
}
func NewUserInfo() *UserInfo {
return &UserInfo{Policy: NewUserPolicy()}
}
type VolUser struct {
Vol string `json:"vol"`
UserIDs []string `json:"user_id"`
Mu sync.RWMutex `json:"-" graphql:"-"`
}
type UserPolicy struct {
OwnVols []string `json:"own_vols" graphql:"own_vols"`
AuthorizedVols map[string][]string `json:"authorized_vols" graphql:"-"` // mapping: volume -> actions
mu sync.RWMutex
}
func NewUserPolicy() *UserPolicy {
return &UserPolicy{
OwnVols: make([]string, 0),
AuthorizedVols: make(map[string][]string),
}
}
func (policy *UserPolicy) IsOwn(volume string) bool {
policy.mu.RLock()
defer policy.mu.RUnlock()
for _, vol := range policy.OwnVols {
if vol == volume {
return true
}
}
return false
}
func (policy *UserPolicy) IsAuthorized(volume, subdir string, action Action) bool {
policy.mu.RLock()
defer policy.mu.RUnlock()
if len(policy.OwnVols) > 0 {
for _, v := range policy.OwnVols {
if v == volume {
return true
}
}
}
values, exist := policy.AuthorizedVols[volume]
if !exist {
return false
}
for _, value := range values {
if perm := ParsePermission(value); !perm.IsNone() && perm.IsBuiltin() && perm.MatchSubdir(subdir) && BuiltinPermissionActions(perm).Contains(action) {
return true
}
if act := ParseAction(value); act == action {
return true
}
}
return false
}
func (policy *UserPolicy) IsAuthorizedS3(volume, api string) bool {
policy.mu.RLock()
defer policy.mu.RUnlock()
perms := policy.AuthorizedVols[volume]
for _, perm := range perms {
if builtinWritablePermRegexp.MatchString(perm) {
return true
}
if builtinReadOnlyPermRegexp.MatchString(perm) && !contain(api, WriteS3Api) {
return true
}
}
return false
}
func contain(str string, strs []string) bool {
for _, v := range strs {
if v == str {
return true
}
}
return false
}
func (policy *UserPolicy) AddOwnVol(volume string) {
policy.mu.Lock()
defer policy.mu.Unlock()
for _, ownVol := range policy.OwnVols {
if ownVol == volume {
return
}
}
policy.OwnVols = append(policy.OwnVols, volume)
}
func (policy *UserPolicy) RemoveOwnVol(volume string) {
policy.mu.Lock()
defer policy.mu.Unlock()
for i, ownVol := range policy.OwnVols {
if ownVol == volume {
if i == len(policy.OwnVols)-1 {
policy.OwnVols = policy.OwnVols[:i]
return
}
policy.OwnVols = append(policy.OwnVols[:i], policy.OwnVols[i+1:]...)
return
}
}
}
func (policy *UserPolicy) AddAuthorizedVol(volume string, policies []string) { // todo check duplicate
policy.mu.Lock()
defer policy.mu.Unlock()
newPolicies := make([]string, 0)
	for _, p := range policies { // use a distinct name to avoid shadowing the receiver
		if perm := ParsePermission(p); !perm.IsNone() {
			newPolicies = append(newPolicies, perm.String())
		}
		if act := ParseAction(p); !act.IsNone() {
			newPolicies = append(newPolicies, act.String())
		}
}
policy.AuthorizedVols[volume] = newPolicies
}
func (policy *UserPolicy) RemoveAuthorizedVol(volume string) {
policy.mu.Lock()
defer policy.mu.Unlock()
delete(policy.AuthorizedVols, volume)
}
func (policy *UserPolicy) SetPerm(volume string, perm Permission) {
policy.mu.Lock()
defer policy.mu.Unlock()
policy.AuthorizedVols[volume] = []string{perm.String()}
}
func (policy *UserPolicy) SetActions(volume string, actions Actions) {
policy.mu.Lock()
defer policy.mu.Unlock()
values := make([]string, actions.Len())
for i, action := range actions {
values[i] = action.String()
}
policy.AuthorizedVols[volume] = values
}
func (policy *UserPolicy) Add(addPolicy *UserPolicy) {
policy.mu.Lock()
defer policy.mu.Unlock()
policy.OwnVols = append(policy.OwnVols, addPolicy.OwnVols...)
for k, v := range addPolicy.AuthorizedVols {
if apis, ok := policy.AuthorizedVols[k]; ok {
policy.AuthorizedVols[k] = append(apis, addPolicy.AuthorizedVols[k]...)
} else {
policy.AuthorizedVols[k] = v
}
}
}
func (policy *UserPolicy) Delete(deletePolicy *UserPolicy) {
policy.mu.Lock()
defer policy.mu.Unlock()
policy.OwnVols = removeSlice(policy.OwnVols, deletePolicy.OwnVols)
for k, v := range deletePolicy.AuthorizedVols {
if apis, ok := policy.AuthorizedVols[k]; ok {
policy.AuthorizedVols[k] = removeSlice(apis, v)
}
}
}
func removeSlice(s []string, removeSlice []string) []string {
if len(s) == 0 {
return s
}
for _, elem := range removeSlice {
for i, v := range s {
if v == elem {
s = append(s[:i], s[i+1:]...)
break
}
}
}
return s
}
func CleanPolicy(policy *UserPolicy) (newUserPolicy *UserPolicy) {
m := make(map[string]bool)
newUserPolicy = NewUserPolicy()
policy.mu.Lock()
defer policy.mu.Unlock()
for _, vol := range policy.OwnVols {
if _, exist := m[vol]; !exist {
m[vol] = true
newUserPolicy.OwnVols = append(newUserPolicy.OwnVols, vol)
}
}
for vol, apis := range policy.AuthorizedVols {
checkMap := make(map[string]bool)
newAPI := make([]string, 0)
for _, api := range apis {
if _, exist := checkMap[api]; !exist {
checkMap[api] = true
newAPI = append(newAPI, api)
}
}
newUserPolicy.AuthorizedVols[vol] = newAPI
}
return
}
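// Hypothetical usage sketch (not part of the original source): composing a
// UserPolicy, deduplicating it with CleanPolicy and checking volume ownership.
// Volume names are illustrative.
func exampleUserPolicyUsage() {
	policy := NewUserPolicy()
	policy.AddOwnVol("vol-a")
	policy.AddOwnVol("vol-a") // duplicate adds are ignored by AddOwnVol
	policy.AddOwnVol("vol-b")
	policy.RemoveOwnVol("vol-b")
	cleaned := CleanPolicy(policy)
	fmt.Println(cleaned.IsOwn("vol-a")) // true
	fmt.Println(cleaned.IsOwn("vol-b")) // false
}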
type UserCreateParam struct {
ID string `json:"id"`
Password string `json:"pwd"`
AccessKey string `json:"ak"`
SecretKey string `json:"sk"`
Type UserType `json:"type"`
Description string `json:"description"`
}
type UserPermUpdateParam struct {
UserID string `json:"user_id"`
Volume string `json:"volume"`
Subdir string `json:"subdir"`
Policy []string `json:"policy"`
}
func NewUserPermUpdateParam(userID, volume string) *UserPermUpdateParam {
	return &UserPermUpdateParam{UserID: userID, Volume: volume, Policy: make([]string, 0)}
}
func (param *UserPermUpdateParam) SetPolicy(policy string) {
param.Policy = append(param.Policy, policy)
}
type UserPermRemoveParam struct {
UserID string `json:"user_id"`
Volume string `json:"volume"`
}
func NewUserPermRemoveParam(userID, volume string) *UserPermRemoveParam {
	return &UserPermRemoveParam{UserID: userID, Volume: volume}
}
type UserTransferVolParam struct {
Volume string `json:"volume"`
UserSrc string `json:"user_src"`
UserDst string `json:"user_dst"`
Force bool `json:"force"`
}
type UserUpdateParam struct {
UserID string `json:"user_id"`
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Type UserType `json:"type"`
Password string `json:"password"`
Description string `json:"description"`
}
package proto
import (
"fmt"
"runtime"
)
//TODO: remove this later.
//go:generate golangci-lint run --issues-exit-code=1 -D errcheck -E bodyclose .
var (
Version string
CommitID string
BranchName string
BuildTime string
)
func DumpVersion(role string) string {
return fmt.Sprintf("CubeFS %s\n"+
"Version : %s\n"+
"Branch : %s\n"+
"Commit : %s\n"+
"Build : %s %s %s %s\n",
role,
Version,
BranchName,
CommitID,
runtime.Version(), runtime.GOOS, runtime.GOARCH, BuildTime)
}
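// Hypothetical build sketch (not part of the original source): Version, CommitID,
// BranchName and BuildTime are typically injected at build time with -ldflags, e.g.
//
//	go build -ldflags "-X github.com/cubefs/cubefs/proto.Version=v1.0.0 \
//	    -X github.com/cubefs/cubefs/proto.CommitID=$(git rev-parse HEAD)"
//
// The exact package path and flags used by the project's build scripts may differ.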
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// Constants for network port definition.
const (
DefaultHeartbeatPort = 5901
DefaultReplicaPort = 5902
DefaultNumOfLogsToRetain = 20000
DefaultTickInterval = 300
DefaultElectionTick = 3
)
// Config defines the configuration properties for the raft store.
type Config struct {
NodeID uint64 // Identity of raft server instance.
RaftPath string // Path of raft logs
IPAddr string // IP address
HeartbeatPort int
ReplicaPort int
NumOfLogsToRetain uint64 // number of logs to be kept after truncation. The default value is 20000.
	// TickInterval is the interval of the timer that checks heartbeat and election timeouts.
	// The default value is 300, in milliseconds.
TickInterval int
// RecvBufSize is the size of raft receive buffer channel.
// The default value is 2048.
RecvBufSize int
	// ElectionTick is the election timeout in ticks. If a follower does not receive any message
	// from the leader of the current term within ElectionTick ticks, it becomes a candidate and starts an election.
	// ElectionTick must be greater than HeartbeatTick.
	// We suggest using ElectionTick = 10 * HeartbeatTick to avoid unnecessary leader switching.
	// The default value is 3 ticks (about 1 second with the default tick interval).
ElectionTick int
}
// PeerAddress defines the set of addresses that will be used by the peers.
type PeerAddress struct {
proto.Peer
Address string
HeartbeatPort int
ReplicaPort int
}
// PartitionConfig defines the configuration properties for the partitions.
type PartitionConfig struct {
ID uint64
Applied uint64
Leader uint64
Term uint64
Peers []PeerAddress
SM PartitionFsm
WalPath string
}
func (p PeerAddress) String() string {
return fmt.Sprintf(`"nodeID":"%v","peerID":"%v","priority":"%v","type":"%v","heartbeatPort":"%v","ReplicaPort":"%v"`,
p.ID, p.PeerID, p.Priority, p.Type.String(), p.HeartbeatPort, p.ReplicaPort)
}
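// Hypothetical configuration sketch (not part of the original source): building a
// raft store Config from the package defaults. All field values are illustrative.
func exampleRaftStoreConfig() *Config {
	return &Config{
		NodeID:            1,
		RaftPath:          "/var/lib/cubefs/raft",
		IPAddr:            "192.168.0.11",
		HeartbeatPort:     DefaultHeartbeatPort,
		ReplicaPort:       DefaultReplicaPort,
		NumOfLogsToRetain: DefaultNumOfLogsToRetain,
		TickInterval:      DefaultTickInterval,
		ElectionTick:      DefaultElectionTick,
	}
}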
package raftstore
import (
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
const (
defaultReportDuration = time.Minute * 3
defaultZombieThreshold = time.Minute * 3
defaultNoLeaderThreshold = time.Second * 30
)
const (
cfgZombieThresholdSec = "raftMonZombieThrSec"
cfgZombieTooLongThresholdSec = "raftMonZombieTooLongThrSec"
cfgNoLeaderThresholdSec = "raftMonNoLeaderThrSec"
cfgNoLeaderTooLongThresholdSec = "raftMonNoLeaderTooLongThrSec"
)
type monitorConf struct {
ZombieThreshold time.Duration
ZombieTooLongThreshold time.Duration
NoLeaderThreshold time.Duration
NoLeaderTooLongThreshold time.Duration
}
var gMonConf = monitorConf{
ZombieThreshold: defaultZombieThreshold,
ZombieTooLongThreshold: defaultReportDuration,
NoLeaderThreshold: defaultNoLeaderThreshold,
NoLeaderTooLongThreshold: defaultReportDuration,
}
func setMonitorConf(cfg *config.Config) {
if cfg == nil {
return
}
cfgZomThr := cfg.GetInt64(cfgZombieThresholdSec)
if cfgZomThr > 0 {
gMonConf.ZombieThreshold = time.Second * time.Duration(cfgZomThr)
}
cfgZomTooLongThr := cfg.GetInt64(cfgZombieTooLongThresholdSec)
if cfgZomTooLongThr > 0 {
gMonConf.ZombieTooLongThreshold = time.Second * time.Duration(cfgZomTooLongThr)
}
cfgNoLeaderThr := cfg.GetInt64(cfgNoLeaderThresholdSec)
if cfgNoLeaderThr > 0 {
gMonConf.NoLeaderThreshold = time.Second * time.Duration(cfgNoLeaderThr)
}
cfgNoLeaderTooLongThr := cfg.GetInt64(cfgNoLeaderTooLongThresholdSec)
if cfgNoLeaderTooLongThr > 0 {
gMonConf.NoLeaderTooLongThreshold = time.Second * time.Duration(cfgNoLeaderTooLongThr)
}
log.LogInfof("set raft monitor cfg: zombieThreshold:[%v], zombieTooLongThreshold:[%v],"+
" noLeaderThreshold:[%v], noLeaderTooLongThreshold:[%v]",
gMonConf.ZombieThreshold, gMonConf.ZombieTooLongThreshold,
gMonConf.NoLeaderThreshold, gMonConf.NoLeaderTooLongThreshold)
}
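// Hypothetical configuration sketch (not part of the original source): the monitor
// thresholds above map to these JSON keys, each expressed in seconds. The values
// shown mirror the package defaults.
//
//	{
//	    "raftMonZombieThrSec": 180,
//	    "raftMonZombieTooLongThrSec": 180,
//	    "raftMonNoLeaderThrSec": 30,
//	    "raftMonNoLeaderTooLongThrSec": 180
//	}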
type zombiePeer struct {
partitionID uint64
peer proto.Peer
}
type monitor struct {
zombieDurations map[zombiePeer]time.Duration
zombieDurationMutex sync.RWMutex
noLeaderDurations map[uint64]time.Duration
noLeaderDurationsMutex sync.RWMutex
}
func newMonitor() *monitor {
	m := &monitor{}
m.zombieDurations = make(map[zombiePeer]time.Duration)
m.noLeaderDurations = make(map[uint64]time.Duration)
return m
}
func (d *monitor) MonitorZombie(id uint64, peer proto.Peer, replicasMsg string, du time.Duration) {
if du < gMonConf.ZombieThreshold {
return
}
needReport := true
var errMsg string
zombiePeer := zombiePeer{
partitionID: id,
peer: peer,
}
d.zombieDurationMutex.RLock()
oldDu := d.zombieDurations[zombiePeer]
d.zombieDurationMutex.RUnlock()
if oldDu == 0 || du < oldDu {
// peer became zombie recently
errMsg = fmt.Sprintf("[MonitorZombie] raft peer zombie, "+
"partitionID[%d] replicaID[%v] replicasMsg[%s] zombiePeer[%v] zombieDuration[%v]",
id, peer.PeerID, replicasMsg, peer, du)
} else if du-oldDu > gMonConf.ZombieTooLongThreshold {
// peer keeping zombie for too long
errMsg = fmt.Sprintf("[MonitorZombieTooLong] raft peer zombie too long, "+
"partitionID[%d] replicaID[%v] replicasMsg[%s] zombiePeer[%v] zombieDuration[%v]",
id, peer.PeerID, replicasMsg, peer, du)
} else {
// peer keeping zombie, but it's not time for another too-long-report yet
needReport = false
}
if !needReport {
return
}
d.zombieDurationMutex.Lock()
d.zombieDurations[zombiePeer] = du
d.zombieDurationMutex.Unlock()
log.LogError(errMsg)
exporter.Warning(errMsg)
}
func (d *monitor) MonitorElection(id uint64, replicaMsg string, du time.Duration) {
if du < gMonConf.NoLeaderThreshold {
return
}
needReport := true
var errMsg string
d.noLeaderDurationsMutex.RLock()
oldDu := d.noLeaderDurations[id]
d.noLeaderDurationsMutex.RUnlock()
if oldDu == 0 || du < oldDu {
// became no leader recently
errMsg = fmt.Sprintf("[RaftNoLeader] raft no leader partitionID[%d]_replicas[%v]_Duration[%v]",
id, replicaMsg, du)
} else if du-oldDu > gMonConf.NoLeaderTooLongThreshold {
// keeping no leader for too long
errMsg = fmt.Sprintf("[RaftNoLeaderTooLong] raft no leader too long, "+
"partitionID[%d]_replicas[%v]_Duration[%v]",
id, replicaMsg, du)
} else {
// keeping not health, but it's not time for another too-long-report yet
needReport = false
}
if !needReport {
return
}
d.noLeaderDurationsMutex.Lock()
d.noLeaderDurations[id] = du
d.noLeaderDurationsMutex.Unlock()
log.LogError(errMsg)
exporter.Warning(errMsg)
}
func (d *monitor) RemovePeer(id uint64, p proto.Peer) {
zp := zombiePeer{
partitionID: id,
peer: p,
}
d.zombieDurationMutex.Lock()
_, present := d.zombieDurations[zp]
if present {
delete(d.zombieDurations, zp)
log.LogInfof("remove peer from raft monitor, partitionID: %v, peer: %v", id, p)
}
d.zombieDurationMutex.Unlock()
}
func (d *monitor) RemovePartition(id uint64, peers []proto.Peer) {
d.noLeaderDurationsMutex.Lock()
_, present := d.noLeaderDurations[id]
if present {
delete(d.noLeaderDurations, id)
log.LogInfof("remove partition from raft monitor, partitionID: %v, peers: %v", id, peers)
}
d.noLeaderDurationsMutex.Unlock()
for _, p := range peers {
d.RemovePeer(id, p)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore
import (
"os"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// PartitionStatus is a type alias of raft.Status
type PartitionStatus = raft.Status
// PartitionFsm wraps necessary methods include both FSM implementation
// and data storage operation for raft store partition.
// It extends from raft StateMachine and Store.
type PartitionFsm = raft.StateMachine
// Partition wraps necessary methods for raft store partition operation.
// Partition is a shard for multi-raft in RaftStore. RaftStore is based on multi-raft, which
// manages multiple raft replication groups at the same time through a single
// raft server instance and its system resources.
type Partition interface {
// Submit submits command data to raft log.
Submit(cmd []byte) (resp interface{}, err error)
// ChangeMember submits member change event and information to raft log.
ChangeMember(changeType proto.ConfChangeType, peer proto.Peer, context []byte) (resp interface{}, err error)
// Stop removes the raft partition from raft server and shuts down this partition.
Stop() error
// Delete stops and deletes the partition.
Delete() error
// Status returns the current raft status.
Status() (status *PartitionStatus)
	// IsRestoring reports whether the partition is restoring a snapshot; much faster than Status().RestoringSnapshot.
IsRestoring() bool
// LeaderTerm returns the current term of leader in the raft group. TODO what is term?
LeaderTerm() (leaderID, term uint64)
// IsRaftLeader returns true if this node is the leader of the raft group it belongs to.
IsRaftLeader() bool
// AppliedIndex returns the current index of the applied raft log in the raft store partition.
AppliedIndex() uint64
	// CommittedIndex returns the current index of the committed raft log in the raft store partition.
CommittedIndex() uint64
// Truncate raft log
Truncate(index uint64)
TryToLeader(nodeID uint64) error
IsOfflinePeer() bool
}
// Default implementation of the Partition interface.
type partition struct {
id uint64
raft *raft.RaftServer
walPath string
config *PartitionConfig
}
// ChangeMember submits member change event and information to raft log.
func (p *partition) ChangeMember(changeType proto.ConfChangeType, peer proto.Peer, context []byte) (
resp interface{}, err error) {
if !p.IsRaftLeader() {
err = raft.ErrNotLeader
return
}
future := p.raft.ChangeMember(p.id, changeType, peer, context)
resp, err = future.Response()
return
}
// Stop removes the raft partition from raft server and shuts down this partition.
func (p *partition) Stop() (err error) {
err = p.raft.RemoveRaft(p.id)
return
}
func (p *partition) TryToLeader(nodeID uint64) (err error) {
future := p.raft.TryToLeader(nodeID)
_, err = future.Response()
return
}
// Delete stops and deletes the partition.
func (p *partition) Delete() (err error) {
if err = p.Stop(); err != nil {
return
}
err = os.RemoveAll(p.walPath)
return
}
func (p *partition) IsRestoring() bool {
return p.raft.IsRestoring(p.id)
}
// Status returns the current raft status.
func (p *partition) Status() (status *PartitionStatus) {
status = p.raft.Status(p.id)
return
}
// LeaderTerm returns the current term of leader in the raft group.
func (p *partition) LeaderTerm() (leaderID, term uint64) {
if p.raft == nil {
return
}
leaderID, term = p.raft.LeaderTerm(p.id)
return
}
func (p *partition) IsOfflinePeer() bool {
status := p.Status()
active := 0
sumPeers := 0
for _, peer := range status.Replicas {
if peer.Active {
active++
}
sumPeers++
}
	return active >= (sumPeers/2 + 1)
}
// IsRaftLeader returns true if this node is the leader of the raft group it belongs to.
func (p *partition) IsRaftLeader() (isLeader bool) {
isLeader = p.raft != nil && p.raft.IsLeader(p.id)
return
}
// AppliedIndex returns the current index of the applied raft log in the raft store partition.
func (p *partition) AppliedIndex() (applied uint64) {
applied = p.raft.AppliedIndex(p.id)
return
}
// CommittedIndex returns the current index of the committed raft log in the raft store partition.
func (p *partition) CommittedIndex() (committed uint64) {
	committed = p.raft.CommittedIndex(p.id)
return
}
// Submit submits command data to raft log.
func (p *partition) Submit(cmd []byte) (resp interface{}, err error) {
if !p.IsRaftLeader() {
err = raft.ErrNotLeader
return
}
future := p.raft.Submit(p.id, cmd)
resp, err = future.Response()
return
}
// Truncate truncates the raft log
func (p *partition) Truncate(index uint64) {
if p.raft != nil {
p.raft.Truncate(p.id, index)
}
}
func newPartition(cfg *PartitionConfig, raft *raft.RaftServer, walPath string) Partition {
return &partition{
id: cfg.ID,
raft: raft,
walPath: walPath,
config: cfg,
}
}
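// Hypothetical usage sketch (not part of the original source): submitting a command
// through a Partition. The payload is illustrative; callers that receive
// raft.ErrNotLeader are expected to redirect the request to the current leader.
func exampleSubmit(p Partition) error {
	resp, err := p.Submit([]byte("example-command"))
	if err == raft.ErrNotLeader {
		// not the leader: look up the current leader (e.g. via LeaderTerm) and retry there
		return err
	}
	if err != nil {
		return err
	}
	_ = resp // the response type is defined by the partition's state machine
	return nil
}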
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore
import (
"fmt"
syslog "log"
"os"
"path"
"strconv"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/storage/wal"
raftlog "github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
utilConfig "github.com/cubefs/cubefs/util/config"
)
// RaftStore defines the interface for the raft store.
type RaftStore interface {
CreatePartition(cfg *PartitionConfig) (Partition, error)
Stop()
RaftConfig() *raft.Config
RaftStatus(raftID uint64) (raftStatus *raft.Status)
NodeManager
RaftServer() *raft.RaftServer
}
type raftStore struct {
nodeID uint64
resolver NodeResolver
raftConfig *raft.Config
raftServer *raft.RaftServer
raftPath string
}
// RaftConfig returns the raft configuration.
func (s *raftStore) RaftConfig() *raft.Config {
return s.raftConfig
}
func (s *raftStore) RaftStatus(raftID uint64) (raftStatus *raft.Status) {
return s.raftServer.Status(raftID)
}
// AddNodeWithPort add a new node with the given port.
func (s *raftStore) AddNodeWithPort(nodeID uint64, addr string, heartbeat int, replicate int) {
s.resolver.AddNodeWithPort(nodeID, addr, heartbeat, replicate)
}
// DeleteNode deletes the node with the given ID in the raft store.
func (s *raftStore) DeleteNode(nodeID uint64) {
s.resolver.DeleteNode(nodeID)
}
// Stop stops the raft store server.
func (s *raftStore) Stop() {
if s.raftServer != nil {
s.raftServer.Stop()
}
}
func newRaftLogger(dir string) {
raftLogPath := path.Join(dir, "logs")
_, err := os.Stat(raftLogPath)
if err != nil {
if pathErr, ok := err.(*os.PathError); ok {
if os.IsNotExist(pathErr) {
os.MkdirAll(raftLogPath, 0o755)
}
}
}
raftLog, err := raftlog.NewLog(raftLogPath, "raft", "debug")
if err != nil {
syslog.Println("Fatal: failed to start the baud storage daemon - ", err)
return
}
	logger.SetLogger(raftLog)
}
// NewRaftStore returns a new raft store instance.
func NewRaftStore(cfg *Config, extendCfg *utilConfig.Config) (mr RaftStore, err error) {
resolver := NewNodeResolver()
newRaftLogger(cfg.RaftPath)
setMonitorConf(extendCfg)
rc := raft.DefaultConfig()
rc.NodeID = cfg.NodeID
rc.LeaseCheck = true
rc.PreVote = true
if cfg.HeartbeatPort <= 0 {
cfg.HeartbeatPort = DefaultHeartbeatPort
}
if cfg.ReplicaPort <= 0 {
cfg.ReplicaPort = DefaultReplicaPort
}
if cfg.NumOfLogsToRetain == 0 {
cfg.NumOfLogsToRetain = DefaultNumOfLogsToRetain
}
if cfg.ElectionTick < DefaultElectionTick {
cfg.ElectionTick = DefaultElectionTick
}
if cfg.TickInterval < DefaultTickInterval {
cfg.TickInterval = DefaultTickInterval
}
	// If cfg's RecvBufSize is bigger than the default 2048,
	// use the bigger value.
if cfg.RecvBufSize > rc.ReqBufferSize {
rc.ReqBufferSize = cfg.RecvBufSize
}
rc.HeartbeatAddr = fmt.Sprintf("%s:%d", cfg.IPAddr, cfg.HeartbeatPort)
rc.ReplicateAddr = fmt.Sprintf("%s:%d", cfg.IPAddr, cfg.ReplicaPort)
rc.Resolver = resolver
rc.RetainLogs = cfg.NumOfLogsToRetain
rc.TickInterval = time.Duration(cfg.TickInterval) * time.Millisecond
rc.ElectionTick = cfg.ElectionTick
rs, err := raft.NewRaftServer(rc)
if err != nil {
return
}
mr = &raftStore{
nodeID: cfg.NodeID,
resolver: resolver,
raftConfig: rc,
raftServer: rs,
raftPath: cfg.RaftPath,
}
return
}
func (s *raftStore) RaftServer() *raft.RaftServer {
return s.raftServer
}
// CreatePartition creates a new partition in the raft store.
func (s *raftStore) CreatePartition(cfg *PartitionConfig) (p Partition, err error) {
	// Init WAL storage for this partition.
	// Variables:
	//   wc: WAL configuration.
	//   walPath: WAL path.
	//   ws: WAL storage.
var walPath string
if cfg.WalPath == "" {
walPath = path.Join(s.raftPath, strconv.FormatUint(cfg.ID, 10))
} else {
walPath = path.Join(cfg.WalPath, "wal_"+strconv.FormatUint(cfg.ID, 10))
}
wc := &wal.Config{}
ws, err := wal.NewStorage(walPath, wc)
if err != nil {
return
}
peers := make([]proto.Peer, 0)
for _, peerAddress := range cfg.Peers {
peers = append(peers, peerAddress.Peer)
s.AddNodeWithPort(
peerAddress.ID,
peerAddress.Address,
peerAddress.HeartbeatPort,
peerAddress.ReplicaPort,
)
}
logger.Info("action[raftstore:CreatePartition] raft config applied [%v] id:%d", cfg.Applied, cfg.ID)
rc := &raft.RaftConfig{
ID: cfg.ID,
Peers: peers,
Leader: cfg.Leader,
Term: cfg.Term,
Storage: ws,
StateMachine: cfg.SM,
Applied: cfg.Applied,
Monitor: newMonitor(),
}
if err = s.raftServer.CreateRaft(rc); err != nil {
return
}
p = newPartition(cfg, s.raftServer, walPath)
return
}
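// Hypothetical usage sketch (not part of the original source): bootstrapping a raft
// store and creating a single partition. `fsm` stands for any implementation of
// PartitionFsm (raft.StateMachine); node IDs, paths and addresses are illustrative,
// and passing nil as the extended config keeps the default monitor thresholds.
func exampleCreatePartition(fsm PartitionFsm) (Partition, error) {
	store, err := NewRaftStore(&Config{
		NodeID:   1,
		RaftPath: "/var/lib/cubefs/raft",
		IPAddr:   "192.168.0.11",
	}, nil)
	if err != nil {
		return nil, err
	}
	return store.CreatePartition(&PartitionConfig{
		ID: 100,
		SM: fsm,
		Peers: []PeerAddress{
			{Peer: proto.Peer{ID: 1}, Address: "192.168.0.11"},
		},
	})
}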
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore_db
import (
"fmt"
"os"
"strings"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/fileutil"
"github.com/cubefs/cubefs/util/log"
"github.com/tecbot/gorocksdb"
)
// RocksDBStore is a wrapper around gorocksdb.DB.
type RocksDBStore struct {
dir string
lruCacheSize int
writeBufferSize int
db *gorocksdb.DB
}
func (rs *RocksDBStore) GetLruCacheSize() int {
return rs.lruCacheSize
}
func (rs *RocksDBStore) GetWriteBufferSize() int {
return rs.writeBufferSize
}
func (rs *RocksDBStore) GetDir() string {
return rs.dir
}
// NewRocksDBStore returns a new RocksDB instance.
func NewRocksDBStore(dir string, lruCacheSize, writeBufferSize int) (store *RocksDBStore, err error) {
if err = os.MkdirAll(dir, os.ModePerm); err != nil {
return
}
store = &RocksDBStore{
dir: dir,
lruCacheSize: lruCacheSize,
writeBufferSize: writeBufferSize,
}
if err = store.Open(); err != nil {
return
}
return
}
func GetRocksDBStoreRecoveryDir(dir string) string {
dir = strings.TrimSuffix(dir, "/")
return fmt.Sprintf("%v_temp", dir)
}
// NewRocksDBStoreAndRecovery returns a new RocksDB instance after executing recovery.
func NewRocksDBStoreAndRecovery(dir string, lruCacheSize, writeBufferSize int) (store *RocksDBStore, err error) {
// start recovery
recoverDir := GetRocksDBStoreRecoveryDir(dir)
	// if the rocksdb dir does not exist but the temp dir does
	if !fileutil.ExistDir(dir) && fileutil.ExistDir(recoverDir) {
		// move the temp dir to the rocksdb dir to commit the pending transaction
if err = os.Rename(recoverDir, dir); err != nil {
log.LogErrorf("action[NewRocksDBStoreAndRecovery]failed to rename rocksdb recovery dir %v", err.Error())
return
}
log.LogDebug("action[NewRocksDBStoreAndRecovery]recovery rocksdb success")
} else if err = os.MkdirAll(dir, os.ModePerm); err != nil {
return
}
store = &RocksDBStore{
dir: dir,
lruCacheSize: lruCacheSize,
writeBufferSize: writeBufferSize,
}
if err = store.Open(); err != nil {
return
}
return
}
// Open opens the RocksDB instance.
func (rs *RocksDBStore) Open() error {
basedTableOptions := gorocksdb.NewDefaultBlockBasedTableOptions()
basedTableOptions.SetBlockCache(gorocksdb.NewLRUCache(uint64(rs.lruCacheSize)))
opts := gorocksdb.NewDefaultOptions()
opts.SetBlockBasedTableFactory(basedTableOptions)
opts.SetCreateIfMissing(true)
opts.SetWriteBufferSize(rs.writeBufferSize)
opts.SetMaxWriteBufferNumber(2)
opts.SetCompression(gorocksdb.NoCompression)
db, err := gorocksdb.OpenDb(opts, rs.dir)
if err != nil {
err = fmt.Errorf("action[openRocksDB],err:%v", err)
return err
}
rs.db = db
return nil
}
func (rs *RocksDBStore) Close() {
rs.db.Close()
}
// Del deletes a key-value pair.
func (rs *RocksDBStore) Del(key interface{}, isSync bool) (result interface{}, err error) {
ro := gorocksdb.NewDefaultReadOptions()
wo := gorocksdb.NewDefaultWriteOptions()
wb := gorocksdb.NewWriteBatch()
wo.SetSync(isSync)
defer func() {
wo.Destroy()
ro.Destroy()
wb.Destroy()
}()
slice, err := rs.db.Get(ro, []byte(key.(string)))
if err != nil {
return
}
result = slice.Data()
err = rs.db.Delete(wo, []byte(key.(string)))
return
}
// Put adds a new key-value pair to the RocksDB.
func (rs *RocksDBStore) Put(key, value interface{}, isSync bool) (result interface{}, err error) {
wo := gorocksdb.NewDefaultWriteOptions()
wb := gorocksdb.NewWriteBatch()
wo.SetSync(isSync)
defer func() {
wo.Destroy()
wb.Destroy()
}()
wb.Put([]byte(key.(string)), value.([]byte))
if err := rs.db.Write(wo, wb); err != nil {
return nil, err
}
result = value
return result, nil
}
func (rs *RocksDBStore) Flush() (err error) {
fo := gorocksdb.NewDefaultFlushOptions()
return rs.db.Flush(fo)
}
// Get returns the value based on the given key.
func (rs *RocksDBStore) Get(key interface{}) (result interface{}, err error) {
ro := gorocksdb.NewDefaultReadOptions()
ro.SetFillCache(false)
defer ro.Destroy()
return rs.db.GetBytes(ro, []byte(key.(string)))
}
// DeleteKeyAndPutIndex deletes the key-value pair based on the given key and put other keys in the cmdMap to RocksDB.
// TODO explain
func (rs *RocksDBStore) DeleteKeyAndPutIndex(key string, cmdMap map[string][]byte, isSync bool) error {
wo := gorocksdb.NewDefaultWriteOptions()
wo.SetSync(isSync)
wb := gorocksdb.NewWriteBatch()
defer func() {
wo.Destroy()
wb.Destroy()
}()
wb.Delete([]byte(key))
for otherKey, value := range cmdMap {
if otherKey == key {
continue
}
wb.Put([]byte(otherKey), value)
}
if err := rs.db.Write(wo, wb); err != nil {
err = fmt.Errorf("action[deleteFromRocksDB],err:%v", err)
return err
}
return nil
}
// Replace deletes any existing value of the key and puts the new value in a single write batch.
func (rs *RocksDBStore) Replace(key string, value interface{}, isSync bool) (result interface{}, err error) {
wo := gorocksdb.NewDefaultWriteOptions()
wb := gorocksdb.NewWriteBatch()
wo.SetSync(isSync)
defer func() {
wo.Destroy()
wb.Destroy()
}()
wb.Delete([]byte(key))
wb.Put([]byte(key), value.([]byte))
if err := rs.db.Write(wo, wb); err != nil {
return nil, err
}
result = value
return result, nil
}
// BatchDeleteAndPut deletes the keys in deleteSet and puts the key-value pairs from cmdMap in a single write batch.
func (rs *RocksDBStore) BatchDeleteAndPut(deleteSet map[string]util.Null, cmdMap map[string][]byte, isSync bool) error {
wo := gorocksdb.NewDefaultWriteOptions()
wo.SetSync(isSync)
wb := gorocksdb.NewWriteBatch()
defer func() {
wo.Destroy()
wb.Destroy()
}()
for key := range deleteSet {
wb.Delete([]byte(key))
}
for key, value := range cmdMap {
// NOTE: skip if the key in delete set
if deleteSet != nil {
_, ok := deleteSet[key]
if ok {
continue
}
}
wb.Put([]byte(key), value)
}
if err := rs.db.Write(wo, wb); err != nil {
err = fmt.Errorf("action[batchPutToRocksDB],err:%v", err)
return err
}
return nil
}
// BatchPut puts the key-value pairs in batch.
func (rs *RocksDBStore) BatchPut(cmdMap map[string][]byte, isSync bool) error {
return rs.BatchDeleteAndPut(nil, cmdMap, isSync)
}
// SeekForPrefix seeks for the place where the prefix is located in the snapshots.
func (rs *RocksDBStore) SeekForPrefix(prefix []byte) (result map[string][]byte, err error) {
result = make(map[string][]byte)
snapshot := rs.RocksDBSnapshot()
it := rs.Iterator(snapshot)
defer func() {
it.Close()
rs.ReleaseSnapshot(snapshot)
}()
it.Seek(prefix)
for ; it.ValidForPrefix(prefix); it.Next() {
key := it.Key().Data()
value := it.Value().Data()
valueByte := make([]byte, len(value))
copy(valueByte, value)
result[string(key)] = valueByte
it.Key().Free()
it.Value().Free()
}
if err := it.Err(); err != nil {
return nil, err
}
return result, nil
}
// RocksDBSnapshot returns the RocksDB snapshot.
func (rs *RocksDBStore) RocksDBSnapshot() *gorocksdb.Snapshot {
return rs.db.NewSnapshot()
}
// ReleaseSnapshot releases the snapshot and its resources.
func (rs *RocksDBStore) ReleaseSnapshot(snapshot *gorocksdb.Snapshot) {
rs.db.ReleaseSnapshot(snapshot)
}
// Iterator returns the iterator of the snapshot.
func (rs *RocksDBStore) Iterator(snapshot *gorocksdb.Snapshot) *gorocksdb.Iterator {
ro := gorocksdb.NewDefaultReadOptions()
ro.SetFillCache(false)
ro.SetSnapshot(snapshot)
return rs.db.NewIterator(ro)
}
func (rs *RocksDBStore) Clear() (err error) {
wo := gorocksdb.NewDefaultWriteOptions()
wo.SetSync(true)
wb := gorocksdb.NewWriteBatch()
defer func() {
wo.Destroy()
wb.Destroy()
}()
	// NOTE: DeleteRange removes keys in the half-open range [0x00, 0xff), which is expected to cover all keys here
wb.DeleteRange([]byte{0}, []byte{255})
err = rs.db.Write(wo, wb)
return
}
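// Hypothetical usage sketch (not part of the original source): opening a store,
// writing and reading a key, then scanning a prefix. The path is illustrative and
// the cache/write-buffer sizes are in bytes.
func exampleRocksDBStoreUsage() error {
	store, err := NewRocksDBStore("/var/lib/cubefs/rocksdb", 256<<20, 4<<20)
	if err != nil {
		return err
	}
	defer store.Close()
	if _, err = store.Put("vol#1", []byte("metadata"), true); err != nil {
		return err
	}
	value, err := store.Get("vol#1")
	if err != nil {
		return err
	}
	_ = value.([]byte) // Get returns the raw bytes stored under the key
	kvs, err := store.SeekForPrefix([]byte("vol#"))
	if err != nil {
		return err
	}
	_ = kvs // map of every key starting with "vol#" to its value
	return nil
}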
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore
import (
"fmt"
"strings"
"sync"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/util/errors"
)
// Error definitions.
var (
ErrNoSuchNode = errors.New("no such node")
ErrIllegalAddress = errors.New("illegal address")
ErrUnknownSocketType = errors.New("unknown socket type")
)
// This private struct defines the necessary properties for node address info.
type nodeAddress struct {
Heartbeat string
Replicate string
}
// NodeManager defines the necessary methods for node address management.
type NodeManager interface {
// add node address with specified port.
AddNodeWithPort(nodeID uint64, addr string, heartbeat int, replicate int)
// delete node address information
DeleteNode(nodeID uint64)
}
// NodeResolver defines the methods for node address resolving and management.
// It is extended from SocketResolver and NodeManager.
type NodeResolver interface {
raft.SocketResolver
NodeManager
}
// Default thread-safe implementation of the NodeResolver interface.
type nodeResolver struct {
nodeMap sync.Map
}
// NodeAddress resolves NodeID as net.Addr.
// This method is necessary for SocketResolver interface implementation.
func (r *nodeResolver) NodeAddress(nodeID uint64, stype raft.SocketType) (addr string, err error) {
val, ok := r.nodeMap.Load(nodeID)
if !ok {
err = ErrNoSuchNode
return
}
address, ok := val.(*nodeAddress)
if !ok {
err = ErrIllegalAddress
return
}
switch stype {
case raft.HeartBeat:
addr = address.Heartbeat
case raft.Replicate:
addr = address.Replicate
default:
err = ErrUnknownSocketType
}
return
}
// AddNode adds node address information.
func (r *nodeResolver) AddNode(nodeID uint64, addr string) {
r.AddNodeWithPort(nodeID, addr, 0, 0)
}
// AddNodeWithPort adds node address with specified port.
func (r *nodeResolver) AddNodeWithPort(nodeID uint64, addr string, heartbeat int, replicate int) {
if heartbeat == 0 {
heartbeat = DefaultHeartbeatPort
}
if replicate == 0 {
replicate = DefaultReplicaPort
}
if len(strings.TrimSpace(addr)) != 0 {
r.nodeMap.Store(nodeID, &nodeAddress{
Heartbeat: fmt.Sprintf("%s:%d", addr, heartbeat),
Replicate: fmt.Sprintf("%s:%d", addr, replicate),
})
}
}
// DeleteNode deletes the node address information of the specified node ID from the NodeManager if possible.
func (r *nodeResolver) DeleteNode(nodeID uint64) {
r.nodeMap.Delete(nodeID)
}
// NewNodeResolver returns a new NodeResolver instance for node address management and resolving.
func NewNodeResolver() NodeResolver {
return &nodeResolver{}
}
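// Hypothetical usage sketch (not part of the original source): registering a peer
// and resolving its heartbeat endpoint. The address is illustrative; passing 0 for
// the ports falls back to DefaultHeartbeatPort and DefaultReplicaPort.
func exampleNodeResolverUsage() (string, error) {
	resolver := NewNodeResolver()
	resolver.AddNodeWithPort(1, "192.168.0.11", 0, 0)
	// resolves to "192.168.0.11:5901" with the default heartbeat port
	return resolver.NodeAddress(1, raft.HeartBeat)
}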
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package repl
import (
"fmt"
"io"
"net"
"strings"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
var (
ErrBadNodes = errors.New("BadNodesErr")
ErrArgLenMismatch = errors.New("ArgLenMismatchErr")
)
type Packet struct {
proto.Packet
followersAddrs []string
followerPackets []*FollowerPacket
IsReleased int32 // TODO what is released?
Object interface{}
TpObject *exporter.TimePointCount
NeedReply bool
OrgBuffer []byte
// used locally
shallDegrade bool
AfterPre bool
}
type FollowerPacket struct {
proto.Packet
respCh chan error
}
func NewFollowerPacket() (fp *FollowerPacket) {
fp = new(FollowerPacket)
fp.respCh = make(chan error, 1)
fp.StartT = time.Now().UnixNano()
return fp
}
func (p *FollowerPacket) PackErrorBody(action, msg string) {
p.identificationErrorResultCode(action, msg)
p.Size = uint32(len([]byte(action + "_" + msg)))
p.Data = make([]byte, p.Size)
copy(p.Data[:int(p.Size)], []byte(action+"_"+msg))
}
func (p *FollowerPacket) IsErrPacket() bool {
return p.ResultCode != proto.OpOk && p.ResultCode != proto.OpInitResultCode
}
func (p *FollowerPacket) identificationErrorResultCode(errLog string, errMsg string) {
if strings.Contains(errLog, ActionReceiveFromFollower) || strings.Contains(errLog, ActionSendToFollowers) ||
strings.Contains(errLog, ConnIsNullErr) {
p.ResultCode = proto.OpIntraGroupNetErr
log.LogErrorf("action[identificationErrorResultCode] error %v, errmsg %v", errLog, errMsg)
} else if strings.Contains(errMsg, storage.ParameterMismatchError.Error()) ||
strings.Contains(errMsg, ErrorUnknownOp.Error()) {
p.ResultCode = proto.OpArgMismatchErr
} else if strings.Contains(errMsg, proto.ErrDataPartitionNotExists.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, storage.ExtentNotFoundError.Error()) ||
strings.Contains(errMsg, storage.ExtentHasBeenDeletedError.Error()) {
p.ResultCode = proto.OpNotExistErr
} else if strings.Contains(errMsg, storage.NoSpaceError.Error()) {
p.ResultCode = proto.OpDiskNoSpaceErr
} else if strings.Contains(errMsg, storage.TryAgainError.Error()) {
p.ResultCode = proto.OpAgain
} else if strings.Contains(errMsg, raft.ErrNotLeader.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, raft.ErrStopped.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else {
log.LogErrorf("action[identificationErrorResultCode] error %v, errmsg %v", errLog, errMsg)
p.ResultCode = proto.OpIntraGroupNetErr
}
}
func (p *Packet) AfterTp() (ok bool) {
if p.TpObject != nil {
p.TpObject.Set(nil)
}
return
}
func (p *Packet) clean() {
if p.Data == nil && p.OrgBuffer == nil {
return
}
p.Object = nil
p.TpObject = nil
p.Data = nil
p.Arg = nil
if p.OrgBuffer != nil && len(p.OrgBuffer) == util.BlockSize && p.IsNormalWriteOperation() {
proto.Buffers.Put(p.OrgBuffer)
p.OrgBuffer = nil
}
}
func copyPacket(src *Packet, dst *FollowerPacket) {
dst.Magic = src.Magic
dst.ExtentType = src.ExtentType
dst.Opcode = src.Opcode
dst.ResultCode = src.ResultCode
dst.CRC = src.CRC
dst.Size = src.Size
dst.KernelOffset = src.KernelOffset
dst.PartitionID = src.PartitionID
dst.ExtentID = src.ExtentID
dst.ExtentOffset = src.ExtentOffset
dst.ReqID = src.ReqID
dst.Data = src.OrgBuffer
}
func (p *Packet) BeforeTp(clusterID string) (ok bool) {
if p.IsForwardPkt() && !p.IsRandomWrite() {
p.TpObject = exporter.NewTPCnt(fmt.Sprintf("PrimaryBackUp_%v", p.GetOpMsg()))
} else if p.IsRandomWrite() {
p.TpObject = exporter.NewTPCnt(fmt.Sprintf("Raft_%v", p.GetOpMsg()))
}
return
}
func (p *Packet) resolveFollowersAddr() (err error) {
defer func() {
if err != nil {
p.PackErrorBody(ActionPreparePkt, err.Error())
}
}()
if len(p.Arg) < int(p.ArgLen) {
err = ErrArgLenMismatch
return
}
str := string(p.Arg[:int(p.ArgLen)])
followerAddrs := strings.SplitN(str, proto.AddrSplit, -1)
followerNum := uint8(len(followerAddrs) - 1)
p.followersAddrs = make([]string, followerNum)
p.followerPackets = make([]*FollowerPacket, followerNum)
p.OrgBuffer = p.Data
if followerNum > 0 {
p.followersAddrs = followerAddrs[:int(followerNum)]
log.LogInfof("action[resolveFollowersAddr] %v", p.followersAddrs)
}
if p.RemainingFollowers < 0 {
err = ErrBadNodes
return
}
return
}
func NewPacket() (p *Packet) {
p = new(Packet)
p.Magic = proto.ProtoMagic
p.StartT = time.Now().UnixNano()
p.NeedReply = true
return
}
func NewPacketToGetAllWatermarks(partitionID uint64, extentType uint8) (p *Packet) {
p = new(Packet)
p.Opcode = proto.OpGetAllWatermarks
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
p.ExtentType = extentType
return
}
func NewPacketToReadTinyDeleteRecord(partitionID uint64, offset int64) (p *Packet) {
p = new(Packet)
p.Opcode = proto.OpReadTinyDeleteRecord
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
p.ExtentOffset = offset
return
}
func NewReadTinyDeleteRecordResponsePacket(requestID int64, partitionID uint64) (p *Packet) {
p = new(Packet)
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpOk
p.ReqID = requestID
p.ExtentType = proto.NormalExtentType
return
}
func NewExtentRepairReadPacket(partitionID uint64, extentID uint64, offset, size int) (p *Packet) {
p = new(Packet)
p.ExtentID = extentID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ExtentOffset = int64(offset)
p.Size = uint32(size)
p.Opcode = proto.OpExtentRepairRead
p.ExtentType = proto.NormalExtentType
p.ReqID = proto.GenerateRequestID()
return
}
func NewTinyExtentRepairReadPacket(partitionID uint64, extentID uint64, offset, size int) (p *Packet) {
p = new(Packet)
p.ExtentID = extentID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ExtentOffset = int64(offset)
p.Size = uint32(size)
p.Opcode = proto.OpTinyExtentRepairRead
p.ExtentType = proto.TinyExtentType
p.ReqID = proto.GenerateRequestID()
return
}
func NewTinyExtentStreamReadResponsePacket(requestID int64, partitionID uint64, extentID uint64) (p *Packet) {
p = new(Packet)
p.ExtentID = extentID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpTinyExtentRepairRead
p.ReqID = requestID
p.ExtentType = proto.TinyExtentType
p.StartT = time.Now().UnixNano()
return
}
func NewStreamReadResponsePacket(requestID int64, partitionID uint64, extentID uint64) (p *Packet) {
p = new(Packet)
p.ExtentID = extentID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpOk
p.ReqID = requestID
p.ExtentType = proto.NormalExtentType
return
}
func NewPacketToNotifyExtentRepair(partitionID uint64) (p *Packet) {
p = new(Packet)
p.Opcode = proto.OpNotifyReplicasToRepair
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
p.ReqID = proto.GenerateRequestID()
return
}
func (p *Packet) IsErrPacket() bool {
return p.ResultCode != proto.OpOk && p.ResultCode != proto.OpInitResultCode
}
func (p *Packet) getErrMessage() (m string) {
return fmt.Sprintf("req(%v) err(%v)", p.GetUniqueLogId(), string(p.Data[:p.Size]))
}
var ErrorUnknownOp = errors.New("unknown opcode")
func (p *Packet) identificationErrorResultCode(errLog string, errMsg string) {
log.LogDebugf("action[identificationErrorResultCode] error %v, errmsg %v", errLog, errMsg)
if strings.Contains(errLog, ActionReceiveFromFollower) || strings.Contains(errLog, ActionSendToFollowers) ||
strings.Contains(errLog, ConnIsNullErr) {
p.ResultCode = proto.OpIntraGroupNetErr
} else if strings.Contains(errMsg, storage.ParameterMismatchError.Error()) ||
strings.Contains(errMsg, ErrorUnknownOp.Error()) {
p.ResultCode = proto.OpArgMismatchErr
} else if strings.Contains(errMsg, proto.ErrDataPartitionNotExists.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, storage.ExtentNotFoundError.Error()) ||
strings.Contains(errMsg, storage.ExtentHasBeenDeletedError.Error()) {
p.ResultCode = proto.OpNotExistErr
} else if strings.Contains(errMsg, storage.NoSpaceError.Error()) {
p.ResultCode = proto.OpDiskNoSpaceErr
} else if strings.Contains(errMsg, storage.BrokenDiskError.Error()) {
p.ResultCode = proto.OpDiskErr
} else if strings.Contains(errMsg, storage.TryAgainError.Error()) {
p.ResultCode = proto.OpAgain
} else if strings.Contains(errMsg, raft.ErrNotLeader.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, raft.ErrStopped.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, storage.VerNotConsistentError.Error()) {
p.ResultCode = proto.ErrCodeVersionOpError
// log.LogDebugf("action[identificationErrorResultCode] not change ver erro code, (%v)", string(debug.Stack()))
} else {
log.LogErrorf("action[identificationErrorResultCode] error %v, errmsg %v", errLog, errMsg)
p.ResultCode = proto.OpIntraGroupNetErr
}
}
func (p *Packet) PackErrorBody(action, msg string) {
p.identificationErrorResultCode(action, msg)
p.Size = uint32(len([]byte(action + "_" + msg)))
p.Data = make([]byte, p.Size)
copy(p.Data[:int(p.Size)], []byte(action+"_"+msg))
}
func (p *Packet) ReadFull(c net.Conn, opcode uint8, readSize int) (err error) {
if p.IsNormalWriteOperation() && readSize == util.BlockSize {
p.Data, _ = proto.Buffers.Get(readSize)
} else {
p.Data = make([]byte, readSize)
}
_, err = io.ReadFull(c, p.Data[:readSize])
return
}
func (p *Packet) IsMasterCommand() bool {
switch p.Opcode {
case
proto.OpDataNodeHeartbeat,
proto.OpVersionOperation,
proto.OpLoadDataPartition,
proto.OpCreateDataPartition,
proto.OpDeleteDataPartition,
proto.OpDecommissionDataPartition,
proto.OpAddDataPartitionRaftMember,
proto.OpRemoveDataPartitionRaftMember,
proto.OpDataPartitionTryToLeader:
return true
default:
return false
}
}
func (p *Packet) IsForwardPacket() bool {
r := p.RemainingFollowers > 0 && !p.isSpecialReplicaCntPacket()
return r
}
func (p *Packet) isSpecialReplicaCntPacket() bool {
r := p.RemainingFollowers == 127
return r
}
// A leader packet is the packet sent to the leader and does not require packet forwarding.
func (p *Packet) IsLeaderPacket() (ok bool) {
if (p.IsForwardPkt() || p.isSpecialReplicaCntPacket()) &&
(p.IsNormalWriteOperation() || p.IsCreateExtentOperation() || p.IsMarkDeleteExtentOperation()) {
ok = true
}
return
}
func (p *Packet) IsTinyExtentType() bool {
return p.ExtentType == proto.TinyExtentType
}
func (p *Packet) IsNormalWriteOperation() bool {
return p.Opcode == proto.OpWrite || p.Opcode == proto.OpSyncWrite
}
func (p *Packet) IsSnapshotModWriteAppendOperation() bool {
return p.Opcode == proto.OpRandomWriteAppend || p.Opcode == proto.OpSyncRandomWriteAppend
}
func (p *Packet) IsCreateExtentOperation() bool {
return p.Opcode == proto.OpCreateExtent
}
func (p *Packet) IsMarkDeleteExtentOperation() bool {
return p.Opcode == proto.OpMarkDelete || p.Opcode == proto.OpSplitMarkDelete
}
func (p *Packet) IsMarkSplitExtentOperation() bool {
return p.Opcode == proto.OpSplitMarkDelete
}
func (p *Packet) IsBatchDeleteExtents() bool {
return p.Opcode == proto.OpBatchDeleteExtent
}
func (p *Packet) IsBroadcastMinAppliedID() bool {
return p.Opcode == proto.OpBroadcastMinAppliedID
}
func (p *Packet) IsRandomWrite() bool {
return p.Opcode == proto.OpRandomWrite || p.Opcode == proto.OpSyncRandomWrite ||
p.Opcode == proto.OpRandomWriteVer || p.Opcode == proto.OpSyncRandomWriteVer
}
func (p *Packet) IsSyncWrite() bool {
return p.Opcode == proto.OpSyncWrite || p.Opcode == proto.OpSyncRandomWrite
}
func (p *Packet) SetDegrade() {
p.shallDegrade = true
}
func (p *Packet) UnsetDegrade() {
p.shallDegrade = false
}
func (p *Packet) ShallDegrade() bool {
return p.shallDegrade
}
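// Hypothetical usage sketch (not part of the original source): building an extent
// repair read request for the first 128 KiB of an extent. IDs and sizes are
// illustrative.
func exampleRepairReadPacket() *Packet {
	// read 128 KiB from offset 0 of extent 1025 on data partition 10
	return NewExtentRepairReadPacket(10, 1025, 0, 128*1024)
}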
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package repl
import (
"container/list"
"fmt"
"net"
"os"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
var gConnPool = util.NewConnectPool()
// ReplProtocol defines the struct of the replication protocol.
// 1. ServerConn reads a packet from the client socket, and analyzes the addresses of the followers.
// 2. After the preparation, the packet is sent to toBeProcessedCh. If a failure happens, it is sent to the response channel.
// 3. OperatorAndForwardPktGoRoutine fetches a packet from toBeProcessedCh and determines whether it needs to be forwarded to the followers.
// 4. receiveResponse fetches a reply from responseCh, executes postFunc, and writes a response to the client if necessary.
type ReplProtocol struct {
packetListLock sync.RWMutex
packetList *list.List // stores all the received packets from the client
ackCh chan struct{} // if sending to all the replicas succeeds, then a signal to this channel
toBeProcessedCh chan *Packet // the goroutine receives an available packet and then sends it to this channel
responseCh chan *Packet // this chan is used to write response to the client
sourceConn net.Conn
exitC chan bool
exited int32
exitedMu sync.RWMutex
followerConnects map[string]*FollowerTransport
lock sync.RWMutex
prepareFunc func(p *Packet) error // prepare packet
operatorFunc func(p *Packet, c net.Conn) error // operator
postFunc func(p *Packet) error // post-processing packet
getSmuxConn func(addr string) (c net.Conn, err error)
putSmuxConn func(conn net.Conn, force bool)
isError int32
replId int64
}
type FollowerTransport struct {
addr string
conn net.Conn
sendCh chan *FollowerPacket
recvCh chan *FollowerPacket
exitCh chan struct{}
exitedMu sync.RWMutex
isclosed int32
}
func NewFollowersTransport(addr string, c net.Conn) (ft *FollowerTransport, err error) {
ft = new(FollowerTransport)
ft.addr = addr
ft.conn = c
ft.sendCh = make(chan *FollowerPacket, 200)
ft.recvCh = make(chan *FollowerPacket, 200)
ft.exitCh = make(chan struct{})
go ft.serverWriteToFollower()
go ft.serverReadFromFollower()
return
}
func (ft *FollowerTransport) serverWriteToFollower() {
for {
select {
case p := <-ft.sendCh:
if err := p.WriteToConn(ft.conn); err != nil {
p.PackErrorBody(ActionSendToFollowers, err.Error())
p.respCh <- fmt.Errorf(string(p.Data[:p.Size]))
log.LogErrorf("serverWriteToFollower ft.addr(%v), err (%v)", ft.addr, err.Error())
ft.conn.Close()
continue
}
ft.recvCh <- p
case <-ft.exitCh:
ft.exitedMu.Lock()
if atomic.AddInt32(&ft.isclosed, -1) == FollowerTransportExited {
ft.conn.Close()
atomic.StoreInt32(&ft.isclosed, FollowerTransportExited)
}
ft.exitedMu.Unlock()
return
}
}
}
func (ft *FollowerTransport) serverReadFromFollower() {
for {
select {
case p := <-ft.recvCh:
ft.readFollowerResult(p)
case <-ft.exitCh:
ft.exitedMu.Lock()
if atomic.AddInt32(&ft.isclosed, -1) == FollowerTransportExited {
ft.conn.Close()
atomic.StoreInt32(&ft.isclosed, FollowerTransportExited)
}
ft.exitedMu.Unlock()
return
}
}
}
// Read the response from the follower
func (ft *FollowerTransport) readFollowerResult(request *FollowerPacket) (err error) {
reply := NewPacket()
defer func() {
reply.clean()
request.respCh <- err
if err != nil {
ft.conn.Close()
}
}()
if request.IsErrPacket() {
err = fmt.Errorf(string(request.Data[:request.Size]))
return
}
timeOut := proto.ReadDeadlineTime
if request.IsBatchDeleteExtents() {
timeOut = proto.BatchDeleteExtentReadDeadLineTime
}
if err = reply.ReadFromConnWithVer(ft.conn, timeOut); err != nil {
log.LogErrorf("readFollowerResult ft.addr(%v), err(%v)", ft.addr, err.Error())
return
}
if reply.ReqID != request.ReqID || reply.PartitionID != request.PartitionID ||
reply.ExtentOffset != request.ExtentOffset || reply.CRC != request.CRC || reply.ExtentID != request.ExtentID {
err = fmt.Errorf(ActionCheckReply+" request(%v), reply(%v) ", request.GetUniqueLogId(),
reply.GetUniqueLogId())
return
}
if reply.IsErrPacket() {
err = fmt.Errorf(string(reply.Data[:reply.Size]))
return
}
log.LogDebugf("action[ActionReceiveFromFollower] %v.", reply.LogMessage(ActionReceiveFromFollower,
ft.addr, request.StartT, err))
return
}
func (ft *FollowerTransport) Destory() {
ft.exitedMu.Lock()
atomic.StoreInt32(&ft.isclosed, FollowerTransportExiting)
close(ft.exitCh)
ft.exitedMu.Unlock()
for {
if atomic.LoadInt32(&ft.isclosed) == FollowerTransportExited {
break
}
time.Sleep(time.Millisecond)
}
close(ft.sendCh)
close(ft.recvCh)
}
func (ft *FollowerTransport) Write(p *FollowerPacket) {
ft.sendCh <- p
}
func NewReplProtocol(inConn net.Conn, prepareFunc func(p *Packet) error,
operatorFunc func(p *Packet, c net.Conn) error, postFunc func(p *Packet) error) *ReplProtocol {
rp := new(ReplProtocol)
rp.packetList = list.New()
rp.ackCh = make(chan struct{}, RequestChanSize)
rp.toBeProcessedCh = make(chan *Packet, RequestChanSize)
rp.responseCh = make(chan *Packet, RequestChanSize)
rp.exitC = make(chan bool, 1)
rp.sourceConn = inConn
rp.followerConnects = make(map[string]*FollowerTransport)
rp.prepareFunc = prepareFunc
rp.operatorFunc = operatorFunc
rp.postFunc = postFunc
rp.exited = ReplRuning
rp.replId = proto.GenerateRequestID()
go rp.OperatorAndForwardPktGoRoutine()
go rp.ReceiveResponseFromFollowersGoRoutine()
	go rp.writeResponseToClientGoRoutine()
return rp
}
func (rp *ReplProtocol) SetSmux(f func(addr string) (net.Conn, error), putSmux func(conn net.Conn, force bool)) {
rp.getSmuxConn = f
rp.putSmuxConn = putSmux
}
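// Hypothetical usage sketch (not part of the original source): serving one client
// connection with a ReplProtocol. The three callbacks are no-op placeholders; real
// implementations prepare the packet, apply it to local storage and post-process it.
func exampleServeConn(conn net.Conn) {
	rp := NewReplProtocol(conn,
		func(p *Packet) error { return nil },             // prepare
		func(p *Packet, c net.Conn) error { return nil }, // operate
		func(p *Packet) error { return nil },             // post-process
	)
	rp.ServerConn() // blocks, reading packets until the connection closes or errors
}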
// ServerConn keeps reading data from the socket to analyze the follower address, execute the prepare function,
// and throw the packets to the to-be-processed channel.
func (rp *ReplProtocol) ServerConn() {
var err error
defer func() {
rp.Stop()
rp.exitedMu.Lock()
if atomic.AddInt32(&rp.exited, -1) == ReplHasExited {
rp.sourceConn.Close()
rp.cleanResource()
}
rp.exitedMu.Unlock()
}()
for {
select {
case <-rp.exitC:
return
default:
if err = rp.readPkgAndPrepare(); err != nil {
return
}
}
}
}
// Receive response from all followers.
func (rp *ReplProtocol) ReceiveResponseFromFollowersGoRoutine() {
for {
select {
case <-rp.ackCh:
			rp.checkLocalResultAndReceiveAllFollowerResponse()
case <-rp.exitC:
rp.exitedMu.Lock()
if atomic.AddInt32(&rp.exited, -1) == ReplHasExited {
rp.sourceConn.Close()
rp.cleanResource()
}
rp.exitedMu.Unlock()
return
}
}
}
func (rp *ReplProtocol) setReplProtocolError(request *Packet, index int) {
atomic.StoreInt32(&rp.isError, ReplProtocolError)
}
func (rp *ReplProtocol) hasError() bool {
return atomic.LoadInt32(&rp.isError) == ReplProtocolError
}
func (rp *ReplProtocol) readPkgAndPrepare() (err error) {
request := NewPacket()
if err = request.ReadFromConnWithVer(rp.sourceConn, proto.NoReadDeadlineTime); err != nil {
return
}
// log.LogDebugf("action[readPkgAndPrepare] packet(%v) op %v from remote(%v) conn(%v) ",
// request.GetUniqueLogId(), request.Opcode, rp.sourceConn.RemoteAddr().String(), rp.sourceConn)
if err = request.resolveFollowersAddr(); err != nil {
err = rp.putResponse(request)
return
}
if err = rp.prepareFunc(request); err != nil {
err = rp.putResponse(request)
return
}
err = rp.putToBeProcess(request)
return
}
func (rp *ReplProtocol) sendRequestToAllFollowers(request *Packet) (index int, err error) {
for index = 0; index < len(request.followersAddrs); index++ {
var transport *FollowerTransport
if transport, err = rp.allocateFollowersConns(request, index); err != nil {
request.PackErrorBody(ActionSendToFollowers, err.Error())
return
}
followerRequest := NewFollowerPacket()
copyPacket(request, followerRequest)
followerRequest.RemainingFollowers = 0
request.followerPackets[index] = followerRequest
transport.Write(followerRequest)
}
return
}
// OperatorAndForwardPktGoRoutine reads packets from the to-be-processed channel and writes responses to the client.
// 1. Read a packet from toBeProcessCh, and determine if it needs to be forwarded or not. If the answer is no, then
// process the packet locally and put it into responseCh.
// 2. If the packet needs to be forwarded, first send it to the followers, then execute the operator function.
// Then notify receiveResponse to read the followers' responses.
// 3. Read a reply from responseCh, and write to the client.
func (rp *ReplProtocol) OperatorAndForwardPktGoRoutine() {
for {
select {
case request := <-rp.toBeProcessedCh:
if !request.IsForwardPacket() {
rp.operatorFunc(request, rp.sourceConn)
rp.putResponse(request)
} else {
index, err := rp.sendRequestToAllFollowers(request)
if err != nil {
rp.setReplProtocolError(request, index)
rp.putResponse(request)
} else {
rp.pushPacketToList(request)
rp.operatorFunc(request, rp.sourceConn)
rp.putAck()
}
}
case <-rp.exitC:
rp.exitedMu.Lock()
if atomic.AddInt32(&rp.exited, -1) == ReplHasExited {
rp.sourceConn.Close()
rp.cleanResource()
}
rp.exitedMu.Unlock()
return
}
}
}
func (rp *ReplProtocol) writeResponseToClientGoRroutine() {
for {
select {
case request := <-rp.responseCh:
rp.writeResponse(request)
case <-rp.exitC:
rp.exitedMu.Lock()
if atomic.AddInt32(&rp.exited, -1) == ReplHasExited {
rp.sourceConn.Close()
rp.cleanResource()
}
rp.exitedMu.Unlock()
return
}
}
}
// func (rp *ReplProtocol) operatorFuncWithWaitGroup(wg *sync.WaitGroup, request *Packet) {
// defer wg.Done()
// rp.operatorFunc(request, rp.sourceConn)
// }
// Read a packet from the list, scan all the follower connections of this packet, and read the responses.
// If reading a response fails, mark the packet as failed and delete it from the list.
// If all the reads succeed, mark the packet as successful.
func (rp *ReplProtocol) checkLocalResultAndReciveAllFollowerResponse() {
var e *list.Element
if e = rp.getNextPacket(); e == nil {
return
}
response := e.Value.(*Packet)
defer func() {
rp.deletePacket(response, e)
}()
if response.IsErrPacket() {
return
}
// NOTE: wait for all followers
for index := 0; index < len(response.followersAddrs); index++ {
followerPacket := response.followerPackets[index]
err := <-followerPacket.respCh
if err != nil {
// NOTE: on a timeout error, mark the request as timed out
// and stop waiting for the remaining followers
if err == os.ErrDeadlineExceeded {
response.PackErrorBody(ActionReceiveFromFollower, err.Error())
return
}
// NOTE: for other errors, record the failure and keep receiving responses from the remaining followers
response.PackErrorBody(ActionReceiveFromFollower, err.Error())
continue
}
}
}
// Write a reply to the client.
func (rp *ReplProtocol) writeResponse(reply *Packet) {
var err error
defer func() {
reply.clean()
}()
log.LogDebugf("writeResponse.opcode %v reply %v conn(%v)", reply.Opcode, reply.GetUniqueLogId(), rp.sourceConn.RemoteAddr().String())
if reply.IsErrPacket() {
err = fmt.Errorf(reply.LogMessage(ActionWriteToClient, rp.sourceConn.RemoteAddr().String(),
reply.StartT, fmt.Errorf(string(reply.Data[:reply.Size]))))
if reply.ResultCode == proto.OpNotExistErr || reply.ResultCode == proto.ErrCodeVersionOpError {
log.LogInfof(err.Error())
} else {
log.LogErrorf(err.Error())
}
rp.Stop()
}
log.LogDebugf("try rsp opcode %v %v %v", rp.replId, reply.Opcode, rp.sourceConn.RemoteAddr().String())
// execute the post-processing function
rp.postFunc(reply)
if !reply.NeedReply {
if reply.Opcode == proto.OpTryWriteAppend || reply.Opcode == proto.OpSyncTryWriteAppend {
log.LogDebugf("try rsp opcode %v", reply.Opcode)
}
return
}
if err = reply.WriteToConn(rp.sourceConn); err != nil {
err = fmt.Errorf(reply.LogMessage(ActionWriteToClient, fmt.Sprintf("local(%v)->remote(%v)", rp.sourceConn.LocalAddr().String(),
rp.sourceConn.RemoteAddr().String()), reply.StartT, err))
log.LogErrorf(err.Error())
rp.Stop()
}
log.LogDebugf(reply.LogMessage(ActionWriteToClient,
rp.sourceConn.RemoteAddr().String(), reply.StartT, err))
}
// Stop stops the replication protocol.
func (rp *ReplProtocol) Stop() {
rp.exitedMu.Lock()
defer rp.exitedMu.Unlock()
if atomic.LoadInt32(&rp.exited) == ReplRuning {
if rp.exitC != nil {
close(rp.exitC)
}
atomic.StoreInt32(&rp.exited, ReplExiting)
}
}
type SmuxConn struct {
once sync.Once
net.Conn
put func(conn net.Conn, force bool)
}
func (d *SmuxConn) Close() error {
d.once.Do(func() {
d.put(d.Conn, true)
})
return nil
}
// Allocate the connections to the followers, keyed by the follower address.
// Note that the order of packets sent to each datanode must remain consistent here.
func (rp *ReplProtocol) allocateFollowersConns(p *Packet, index int) (transport *FollowerTransport, err error) {
rp.lock.RLock()
transport = rp.followerConnects[p.followersAddrs[index]]
rp.lock.RUnlock()
if transport == nil {
addr := p.followersAddrs[index]
var conn net.Conn
if (p.IsMarkDeleteExtentOperation() || p.IsBatchDeleteExtents()) && rp.getSmuxConn != nil {
var smuxCon net.Conn
smuxCon, err = rp.getSmuxConn(addr)
if err != nil {
return
}
conn = &SmuxConn{
Conn: smuxCon,
put: rp.putSmuxConn,
}
} else {
conn, err = gConnPool.GetConnect(addr)
if err != nil {
return
}
}
transport, err = NewFollowersTransport(addr, conn)
if err != nil {
return
}
rp.lock.Lock()
rp.followerConnects[p.followersAddrs[index]] = transport
rp.lock.Unlock()
}
return
}
func (rp *ReplProtocol) getNextPacket() (e *list.Element) {
rp.packetListLock.RLock()
e = rp.packetList.Front()
rp.packetListLock.RUnlock()
return
}
func (rp *ReplProtocol) pushPacketToList(e *Packet) {
rp.packetListLock.Lock()
rp.packetList.PushBack(e)
rp.packetListLock.Unlock()
}
func (rp *ReplProtocol) cleanToBeProcessCh() {
request := len(rp.toBeProcessedCh)
for i := 0; i < request; i++ {
select {
case p := <-rp.toBeProcessedCh:
rp.postFunc(p)
p.clean()
default:
return
}
}
}
func (rp *ReplProtocol) cleanResponseCh() {
replys := len(rp.responseCh)
for i := 0; i < replys; i++ {
select {
case p := <-rp.responseCh:
rp.postFunc(p)
p.clean()
default:
return
}
}
}
// If the replication protocol exits, then clear all the packet resources.
func (rp *ReplProtocol) cleanResource() {
rp.packetListLock.Lock()
for e := rp.packetList.Front(); e != nil; e = e.Next() {
request := e.Value.(*Packet)
rp.postFunc(request)
request.clean()
}
rp.cleanToBeProcessCh()
rp.cleanResponseCh()
rp.packetList = list.New()
rp.lock.RLock()
for _, transport := range rp.followerConnects {
transport.Destory()
}
rp.lock.RUnlock()
close(rp.responseCh)
close(rp.toBeProcessedCh)
close(rp.ackCh)
rp.packetList = nil
rp.followerConnects = nil
rp.packetListLock.Unlock()
}
func (rp *ReplProtocol) deletePacket(reply *Packet, e *list.Element) (success bool) {
rp.packetListLock.Lock()
defer rp.packetListLock.Unlock()
rp.packetList.Remove(e)
success = true
rp.putResponse(reply)
return
}
func (rp *ReplProtocol) putResponse(reply *Packet) (err error) {
select {
case rp.responseCh <- reply:
return
default:
return fmt.Errorf("response Chan has full (%v)", len(rp.responseCh))
}
}
func (rp *ReplProtocol) putToBeProcess(request *Packet) (err error) {
select {
case rp.toBeProcessedCh <- request:
return
default:
return fmt.Errorf("toBeProcessedCh Chan has full (%v)", len(rp.toBeProcessedCh))
}
}
func (rp *ReplProtocol) putAck() (err error) {
select {
case rp.ackCh <- struct{}{}:
return
default:
return fmt.Errorf("ack Chan has full (%v)", len(rp.ackCh))
}
}
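// The put* helpers above all share the same non-blocking send pattern: a select with a default
// branch, so a full channel returns an error instead of stalling the replication pipeline.
// A minimal generic sketch of the pattern (illustrative only, requires Go 1.18+):
//
//	func tryPut[T any](ch chan<- T, v T) error {
//		select {
//		case ch <- v:
//			return nil
//		default:
//			return fmt.Errorf("channel is full (%v)", len(ch))
//		}
//	}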
package auth
import (
"encoding/json"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auth"
"github.com/cubefs/cubefs/util/cryptoutil"
)
func (api *API) GetTicket(clientId string, clientKey string, serviceID string) (ticket *auth.Ticket, err error) {
var (
key []byte
ts int64
msgResp proto.AuthGetTicketResp
respData []byte
)
message := proto.AuthGetTicketReq{
Type: proto.MsgAuthTicketReq,
ClientID: clientId,
ServiceID: serviceID,
}
if key, err = cryptoutil.Base64Decode(clientKey); err != nil {
return
}
if message.Verifier, ts, err = cryptoutil.GenVerifier(key); err != nil {
return
}
if respData, err = api.ac.request(clientId, clientKey, key, message, proto.ClientGetTicket, serviceID); err != nil {
return
}
if err = json.Unmarshal(respData, &msgResp); err != nil {
return
}
if err = proto.VerifyTicketRespComm(&msgResp, proto.MsgAuthTicketReq, clientId, serviceID, ts); err != nil {
return
}
ticket = &auth.Ticket{
ID: clientId,
SessionKey: cryptoutil.Base64Encode(msgResp.SessionKey.Key),
ServiceID: cryptoutil.Base64Encode(msgResp.SessionKey.Key),
Ticket: msgResp.Ticket,
}
return
}
package auth
import (
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/keystore"
)
func (api *API) AdminCreateKey(clientID, clientKey, userID, role string, caps []byte) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
Role: role,
Caps: caps,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthCreateKeyReq, proto.AdminCreateKey)
}
func (api *API) AdminDeleteKey(clientID, clientKey, userID string) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthDeleteKeyReq, proto.AdminDeleteKey)
}
func (api *API) AdminGetKey(clientID, clientKey, userID string) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthGetKeyReq, proto.AdminGetKey)
}
func (api *API) AdminAddCaps(clientID, clientKey, userID string, caps []byte) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
Caps: caps,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthAddCapsReq, proto.AdminAddCaps)
}
func (api *API) AdminDeleteCaps(clientID, clientKey, userID string, caps []byte) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
Caps: caps,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthDeleteCapsReq, proto.AdminDeleteCaps)
}
func (api *API) AdminGetCaps(clientID, clientKey, userID string) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthGetCapsReq, proto.AdminGetCaps)
}
package auth
import (
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/keystore"
)
type API struct {
ac *AuthClient
}
func (api *API) OSSAddCaps(clientID, clientKey, accessKey string, caps []byte) (newAKCaps *keystore.AccessKeyCaps, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
akCaps := &keystore.AccessKeyCaps{
AccessKey: accessKey,
Caps: caps,
}
return api.ac.serveOSSRequest(clientID, clientKey, api.ac.ticket, akCaps, proto.MsgAuthOSAddCapsReq, proto.OSAddCaps)
}
func (api *API) OSSDeleteCaps(clientID, clientKey, accessKey string, caps []byte) (newAKCaps *keystore.AccessKeyCaps, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
akCaps := &keystore.AccessKeyCaps{
AccessKey: accessKey,
Caps: caps,
}
return api.ac.serveOSSRequest(clientID, clientKey, api.ac.ticket, akCaps, proto.MsgAuthOSDeleteCapsReq, proto.OSDeleteCaps)
}
func (api *API) OSSGetCaps(clientID, clientKey, accessKey string) (caps *keystore.AccessKeyCaps, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
akCaps := &keystore.AccessKeyCaps{
AccessKey: accessKey,
}
return api.ac.serveOSSRequest(clientID, clientKey, api.ac.ticket, akCaps, proto.MsgAuthOSGetCapsReq, proto.OSGetCaps)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auth
import (
"encoding/json"
"fmt"
"net/http"
"os"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auth"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/keystore"
"github.com/cubefs/cubefs/util/log"
)
const (
requestTimeout = 30 * time.Second
RequestMaxRetry = 5
RequestSleepInterval = 100 * time.Millisecond
)
type AuthClient struct {
sync.RWMutex
authnodes []string
enableHTTPS bool
certFile string
ticket *auth.Ticket
leaderAddr string
}
func (c *AuthClient) API() *API {
return &API{
ac: c,
}
}
func NewAuthClient(authNodes []string, enableHTTPS bool, certFile string) *AuthClient {
return &AuthClient{authnodes: authNodes, enableHTTPS: enableHTTPS, certFile: certFile}
}
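// Illustrative sketch (the node address and credentials below are assumptions): build an
// AuthClient, take its API wrapper, and fetch a ticket before issuing admin or OSS requests.
// Subsequent Admin*/OSS* calls reuse the ticket cached on the client.
//
//	ac := NewAuthClient([]string{"192.168.0.1:8080"}, false, "")
//	api := ac.API()
//	ticket, err := api.GetTicket("client-1", "<base64-encoded-client-key>", proto.AuthServiceID)
//	if err != nil {
//		// handle error
//	}
//	_ = ticket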
func (c *AuthClient) request(clientID, clientKey string, key []byte, data interface{}, path, serviceID string) (respData []byte, err error) {
var (
body []byte
urlProto string
url string
client *http.Client
certFile []byte
)
if c.enableHTTPS {
urlProto = "https://"
if certFile, err = loadCertfile(c.certFile); err != nil {
err = fmt.Errorf("load cert file failed: %v, certFile[%v]", err, c.certFile)
log.LogWarnf("%v", err)
return
}
client, err = cryptoutil.CreateClientX(&certFile)
if err != nil {
return
}
} else {
urlProto = "http://"
client = &http.Client{}
}
// TODO don't retry if the param is wrong
for i := 0; i < RequestMaxRetry; i++ {
for _, ip := range c.authnodes {
url = urlProto + ip + path
body, err = proto.SendData(client, url, data)
if err != nil {
continue
}
var jobj *proto.HTTPAuthReply
if err = json.Unmarshal(body, &jobj); err != nil {
return nil, fmt.Errorf("unmarshal response body err:%v", err)
}
if jobj.Code != 0 {
if jobj.Code == proto.ErrCodeExpiredTicket {
c.ticket, err = c.API().GetTicket(clientID, clientKey, serviceID)
if err == nil {
c.request(clientID, clientKey, key, data, path, serviceID)
}
}
err = fmt.Errorf(jobj.Msg)
return nil, fmt.Errorf("request error, code[%d], msg[%s]", jobj.Code, err)
}
data := fmt.Sprint(jobj.Data)
if respData, err = cryptoutil.DecodeMessage(data, key); err != nil {
return nil, fmt.Errorf("decode message error: %v", err)
}
return
}
log.LogWarnf("Request authnode: getReply error and will RETRY, url(%v) err(%v)", url, err)
time.Sleep(RequestSleepInterval)
}
log.LogWarnf("Request authnode exit: send to addr(%v) err(%v)", url, err)
return nil, fmt.Errorf("Request authnode: getReply error, url(%v) err(%v)", url, err)
}
func (c *AuthClient) serveOSSRequest(id, key string, ticket *auth.Ticket, akCaps *keystore.AccessKeyCaps, reqType proto.MsgType, reqPath string) (caps *keystore.AccessKeyCaps, err error) {
var (
sessionKey []byte
ts int64
resp proto.AuthOSAccessKeyResp
respData []byte
)
apiReq := &proto.APIAccessReq{
Type: reqType,
ClientID: id,
ServiceID: proto.AuthServiceID,
Ticket: ticket.Ticket,
}
if sessionKey, err = cryptoutil.Base64Decode(ticket.SessionKey); err != nil {
return nil, err
}
if apiReq.Verifier, ts, err = cryptoutil.GenVerifier(sessionKey); err != nil {
return nil, err
}
message := &proto.AuthOSAccessKeyReq{
APIReq: *apiReq,
AKCaps: *akCaps,
}
if respData, err = c.request(id, key, sessionKey, message, reqPath, proto.AuthServiceID); err != nil {
return
}
if err = json.Unmarshal(respData, &resp); err != nil {
return
}
if err = proto.VerifyAPIRespComm(&resp.APIResp, reqType, id, proto.AuthServiceID, ts); err != nil {
return
}
return &resp.AKCaps, err
}
func (c *AuthClient) serveAdminRequest(id, key string, ticket *auth.Ticket, keyInfo *keystore.KeyInfo, reqType proto.MsgType, reqPath string) (res *keystore.KeyInfo, err error) {
var (
sessionKey []byte
ts int64
resp proto.AuthAPIAccessResp
respData []byte
)
apiReq := &proto.APIAccessReq{
Type: reqType,
ClientID: id,
ServiceID: proto.AuthServiceID,
Ticket: ticket.Ticket,
}
if sessionKey, err = cryptoutil.Base64Decode(ticket.SessionKey); err != nil {
return nil, err
}
if apiReq.Verifier, ts, err = cryptoutil.GenVerifier(sessionKey); err != nil {
return nil, err
}
message := &proto.AuthAPIAccessReq{
APIReq: *apiReq,
KeyInfo: *keyInfo,
}
if respData, err = c.request(id, key, sessionKey, message, reqPath, proto.AuthServiceID); err != nil {
return
}
if err = json.Unmarshal(respData, &resp); err != nil {
return
}
if err = proto.VerifyAPIRespComm(&resp.APIResp, reqType, id, proto.AuthServiceID, ts); err != nil {
return
}
return &resp.KeyInfo, err
}
func loadCertfile(path string) (caCert []byte, err error) {
caCert, err = os.ReadFile(path)
if err != nil {
return
}
return
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package blobstore
import (
"bytes"
"context"
"io"
"time"
"github.com/cubefs/cubefs/blobstore/api/access"
"github.com/cubefs/cubefs/blobstore/common/codemode"
ebsproto "github.com/cubefs/cubefs/blobstore/common/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
"github.com/google/uuid"
)
const (
MaxRetryTimes = 3
RetrySleepInterval = 100 * time.Millisecond
)
type BlobStoreClient struct {
client access.API
}
func NewEbsClient(cfg access.Config) (*BlobStoreClient, error) {
cli, err := access.New(cfg)
return &BlobStoreClient{
client: cli,
}, err
}
func (ebs *BlobStoreClient) Read(ctx context.Context, volName string, buf []byte, offset uint64, size uint64, oek proto.ObjExtentKey) (readN int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ebs-read", err, bgTime, 1)
}()
requestId := uuid.New().String()
log.LogDebugf("TRACE Ebs Read Enter requestId(%v), oek(%v)", requestId, oek)
ctx = access.WithRequestID(ctx, requestId)
start := time.Now()
metric := exporter.NewTPCnt(createOPMetric(buf, "ebsread"))
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: volName})
}()
blobs := oek.Blobs
sliceInfos := make([]access.SliceInfo, 0)
for _, b := range blobs {
sliceInfo := access.SliceInfo{
MinBid: ebsproto.BlobID(b.MinBid),
Vid: ebsproto.Vid(b.Vid),
Count: uint32(b.Count),
}
sliceInfos = append(sliceInfos, sliceInfo)
}
loc := access.Location{
ClusterID: ebsproto.ClusterID(oek.Cid),
Size: oek.Size,
Crc: oek.Crc,
CodeMode: codemode.CodeMode(oek.CodeMode),
BlobSize: oek.BlobSize,
Blobs: sliceInfos,
}
// the access client Get has its own retry; the loop below retries it again up to MaxRetryTimes
log.LogDebugf("TRACE Ebs Read,oek(%v) loc(%v)", oek, loc)
var body io.ReadCloser
defer func() {
if body != nil {
body.Close()
}
}()
for i := 0; i < MaxRetryTimes; i++ {
body, err = ebs.client.Get(ctx, &access.GetArgs{Location: loc, Offset: offset, ReadSize: size})
if err == nil {
break
}
log.LogWarnf("TRACE Ebs Read,oek(%v), err(%v), requestId(%v),retryTimes(%v)", oek, err, requestId, i)
time.Sleep(RetrySleepInterval)
}
if err != nil {
log.LogErrorf("TRACE Ebs Read,oek(%v), err(%v), requestId(%v)", oek, err, requestId)
return 0, err
}
readN, err = io.ReadFull(body, buf)
if err != nil {
log.LogErrorf("TRACE Ebs Read,oek(%v), err(%v), requestId(%v)", oek, err, requestId)
return 0, err
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Ebs Read Exit,oek(%v) readN(%v),bufLen(%v),consume(%v)ns", oek, readN, len(buf), elapsed.Nanoseconds())
return readN, nil
}
func (ebs *BlobStoreClient) Write(ctx context.Context, volName string, data []byte, size uint32) (location access.Location, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ebs-write", err, bgTime, 1)
}()
requestId := uuid.New().String()
log.LogDebugf("TRACE Ebs Write Enter,requestId(%v) len(%v)", requestId, size)
start := time.Now()
ctx = access.WithRequestID(ctx, requestId)
metric := exporter.NewTPCnt(createOPMetric(data, "ebswrite"))
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: volName})
}()
for i := 0; i < MaxRetryTimes; i++ {
location, _, err = ebs.client.Put(ctx, &access.PutArgs{
Size: int64(size),
Body: bytes.NewReader(data),
})
if err == nil {
break
}
log.LogWarnf("TRACE Ebs write, err(%v), requestId(%v),retryTimes(%v)", err, requestId, i)
time.Sleep(RetrySleepInterval)
}
if err != nil {
log.LogErrorf("TRACE Ebs write,err(%v),requestId(%v)", err.Error(), requestId)
return location, err
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Ebs Write Exit,requestId(%v) len(%v) consume(%v)ns", requestId, len(data), elapsed.Nanoseconds())
return location, nil
}
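// Illustrative sketch (volume name and payload are assumptions): Write returns an access.Location
// describing where the blobs were stored. The caller is expected to convert that location into a
// proto.ObjExtentKey and persist it in the inode's object extent keys (see writeSlice in the
// writer) before the data can be read back through Read.
//
//	loc, err := ebs.Write(context.Background(), "vol-example", data, uint32(len(data)))
//	if err != nil {
//		// handle error
//	}
//	_ = loc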
func (ebs *BlobStoreClient) Delete(oeks []proto.ObjExtentKey) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ebs-delete", err, bgTime, 1)
}()
ctx, cancel := context.WithTimeout(context.TODO(), time.Second*3)
defer cancel()
locs := make([]access.Location, 0)
for _, oek := range oeks {
sliceInfos := make([]access.SliceInfo, 0)
for _, b := range oek.Blobs {
sliceInfo := access.SliceInfo{
MinBid: ebsproto.BlobID(b.MinBid),
Vid: ebsproto.Vid(b.Vid),
Count: uint32(b.Count),
}
sliceInfos = append(sliceInfos, sliceInfo)
}
loc := access.Location{
ClusterID: ebsproto.ClusterID(oek.Cid),
Size: oek.Size,
Crc: oek.Crc,
CodeMode: codemode.CodeMode(oek.CodeMode),
BlobSize: oek.BlobSize,
Blobs: sliceInfos,
}
locs = append(locs, loc)
}
requestId := uuid.New().String()
log.LogDebugf("start Ebs delete Enter,requestId(%v) len(%v)", requestId, len(oeks))
start := time.Now()
ctx = access.WithRequestID(ctx, requestId)
metric := exporter.NewTPCnt("ebsdel")
defer func() {
metric.SetWithLabels(err, map[string]string{})
}()
_, err = ebs.client.Delete(ctx, &access.DeleteArgs{Locations: locs})
// elapsed is measured after the delete call so the logs report the real latency
elapsed := time.Since(start)
if err != nil {
log.LogErrorf("[EbsDelete] Ebs delete error, id(%v), consume(%v)ns, err(%v)", requestId, elapsed.Nanoseconds(), err.Error())
return err
}
log.LogDebugf("Ebs delete Exit,requestId(%v) len(%v) consume(%v)ns", requestId, len(oeks), elapsed.Nanoseconds())
return err
}
func createOPMetric(buf []byte, tag string) string {
if len(buf) < 4*util.KB {
return tag + "0K_4K"
} else if len(buf) < 128*util.KB {
return tag + "4K_128K"
} else if len(buf) < 1*util.MB {
return tag + "128K_1M"
} else if len(buf) < 4*util.MB {
return tag + "1M_4M"
}
return tag + "4M_8M"
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package blobstore
import (
"context"
"fmt"
"io"
"os"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/blockcache/bcache"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/manager"
"github.com/cubefs/cubefs/sdk/data/stream"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
type rwSlice struct {
index int
fileOffset uint64
size uint32
rOffset uint64
rSize uint32
read int
Data []byte
extentKey proto.ExtentKey
objExtentKey proto.ObjExtentKey
}
func (s rwSlice) String() string {
return fmt.Sprintf("rwSlice{fileOffset(%v),size(%v),rOffset(%v),rSize(%v),read(%v),extentKey(%v),objExtentKey(%v)}", s.fileOffset, s.size, s.rOffset, s.rSize, s.read, s.extentKey, s.objExtentKey)
}
func (reader *Reader) String() string {
return fmt.Sprintf("Reader{address(%v),volName(%v),volType(%v),ino(%v),fileSize(%v),enableBcache(%v),cacheAction(%v),fileCache(%v),cacheThreshold(%v)},readConcurrency(%v)",
&reader, reader.volName, reader.volType, reader.ino, reader.fileLength, reader.enableBcache, reader.cacheAction, reader.fileCache, reader.cacheThreshold, reader.readConcurrency)
}
type Reader struct {
volName string
volType int
ino uint64
offset uint64
data []byte
err chan error
bc *bcache.BcacheClient
mw *meta.MetaWrapper
ec *stream.ExtentClient
ebs *BlobStoreClient
readConcurrency int
cacheTimeout time.Duration
wg sync.WaitGroup
once sync.Once
sync.Mutex
close bool
extentKeys []proto.ExtentKey
missExtentKeys []proto.ExtentKey
objExtentKeys []proto.ObjExtentKey
enableBcache bool
cacheAction int
fileCache bool
cacheThreshold int
fileLength uint64
valid bool
inflightL2cache sync.Map
limitManager *manager.LimitManager
}
type ClientConfig struct {
VolName string
VolType int
BlockSize int
Ino uint64
Bc *bcache.BcacheClient
Mw *meta.MetaWrapper
Ec *stream.ExtentClient
Ebsc *BlobStoreClient
EnableBcache bool
WConcurrency int
ReadConcurrency int
CacheAction int
FileCache bool
FileSize uint64
CacheThreshold int
}
func NewReader(config ClientConfig) (reader *Reader) {
reader = new(Reader)
reader.volName = config.VolName
reader.volType = config.VolType
reader.ino = config.Ino
reader.bc = config.Bc
reader.ebs = config.Ebsc
reader.mw = config.Mw
reader.ec = config.Ec
reader.enableBcache = config.EnableBcache
reader.readConcurrency = config.ReadConcurrency
reader.cacheAction = config.CacheAction
reader.fileCache = config.FileCache
reader.cacheThreshold = config.CacheThreshold
if proto.IsCold(reader.volType) {
reader.ec.UpdateDataPartitionForColdVolume()
}
reader.limitManager = reader.ec.LimitManager
return
}
func (reader *Reader) Read(ctx context.Context, buf []byte, offset int, size int) (int, error) {
if reader == nil {
return 0, fmt.Errorf("reader is not opened yet")
}
log.LogDebugf("TRACE reader Read Enter. ino(%v) offset(%v) len(%v)", reader.ino, offset, size)
var (
read = 0
err error
)
if reader.close {
return 0, os.ErrInvalid
}
reader.Lock()
defer reader.Unlock()
// cold volume: read by slices
var rSlices []*rwSlice
if size != len(buf) {
size = len(buf)
}
rSlices, err = reader.prepareEbsSlice(offset, uint32(size))
log.LogDebugf("TRACE reader Read. ino(%v) rSlices-length(%v) ", reader.ino, len(rSlices))
if err != nil {
return 0, err
}
sliceSize := len(rSlices)
if sliceSize > 0 {
reader.wg.Add(sliceSize)
pool := New(reader.readConcurrency, sliceSize)
defer pool.Close()
reader.err = make(chan error, sliceSize)
for _, rs := range rSlices {
pool.Execute(rs, func(param *rwSlice) {
reader.readSliceRange(ctx, param)
})
}
reader.wg.Wait()
for i := 0; i < sliceSize; i++ {
if err, ok := <-reader.err; !ok || err != nil {
return 0, err
}
}
close(reader.err)
}
for i := 0; i < sliceSize; i++ {
read += copy(buf[read:], rSlices[i].Data)
}
log.LogDebugf("TRACE reader Read Exit. ino(%v) readN(%v) buf-len(%v)", reader.ino, read, len(buf))
return read, nil
}
func (reader *Reader) Close(ctx context.Context) {
reader.Lock()
reader.close = true
reader.Unlock()
}
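// Illustrative sketch (the mw/ec/ebsc handles and the cold volume type value are assumed to be
// provided by the caller): a Reader is built from a ClientConfig and then serves ranged reads
// over the inode's object extents.
//
//	reader := NewReader(ClientConfig{
//		VolName:         "vol-example",
//		VolType:         volType, // cold volume type
//		Ino:             ino,
//		Mw:              mw,
//		Ec:              ec,
//		Ebsc:            ebsc,
//		ReadConcurrency: 4,
//	})
//	n, err := reader.Read(context.Background(), buf, offset, len(buf))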
func (reader *Reader) prepareEbsSlice(offset int, size uint32) ([]*rwSlice, error) {
if offset < 0 {
return nil, syscall.EIO
}
chunks := make([]*rwSlice, 0)
endflag := false
selected := false
reader.once.Do(func() {
reader.refreshEbsExtents()
})
fileSize, valid := reader.fileSize()
reader.fileLength = fileSize
log.LogDebugf("TRACE blobStore prepareEbsSlice Enter. ino(%v) fileSize(%v) ", reader.ino, fileSize)
if !valid {
log.LogErrorf("Reader: invoke fileSize fail. ino(%v) offset(%v) size(%v)", reader.ino, offset, size)
return nil, syscall.EIO
}
log.LogDebugf("TRACE blobStore prepareEbsSlice. ino(%v) offset(%v) size(%v)", reader.ino, offset, size)
if uint64(offset) >= fileSize {
return nil, io.EOF
}
start := uint64(offset)
if uint64(offset)+uint64(size) > fileSize {
size = uint32(fileSize - uint64(offset))
}
end := uint64(offset + int(size))
for index, oek := range reader.objExtentKeys {
rs := &rwSlice{}
if oek.FileOffset <= start && start < oek.FileOffset+(oek.Size) {
rs.index = index
rs.fileOffset = oek.FileOffset
rs.size = uint32(oek.Size)
rs.rOffset = start - oek.FileOffset
rs.rSize = uint32(oek.FileOffset + oek.Size - start)
selected = true
}
if end <= oek.FileOffset+oek.Size {
rs.rSize = uint32(end - start)
selected = true
endflag = true
}
if selected {
rs.objExtentKey = oek
reader.buildExtentKey(rs)
rs.Data = make([]byte, rs.rSize)
start = oek.FileOffset + oek.Size
chunks = append(chunks, rs)
log.LogDebugf("TRACE blobStore prepareEbsSlice. ino(%v) offset(%v) size(%v) rwSlice(%v)", reader.ino, offset, size, rs)
}
if endflag {
break
}
}
log.LogDebugf("TRACE blobStore prepareEbsSlice Exit. ino(%v) offset(%v) size(%v) rwSlices(%v)", reader.ino, offset, size, chunks)
return chunks, nil
}
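// buildExtentKey binary-searches reader.extentKeys (sorted by FileOffset) for the extent key whose
// FileOffset matches the slice's object extent key; if no match is found, the slice gets an empty
// ExtentKey so the read falls back to the blobstore path.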
func (reader *Reader) buildExtentKey(rs *rwSlice) {
if len(reader.extentKeys) <= 0 {
rs.extentKey = proto.ExtentKey{}
} else {
low := 0
high := len(reader.extentKeys) - 1
for low <= high {
mid := (high + low) / 2
target := reader.extentKeys[mid]
if target.FileOffset == rs.objExtentKey.FileOffset {
rs.extentKey = target
return
} else if target.FileOffset > rs.objExtentKey.FileOffset {
high = mid - 1
} else {
low = mid + 1
}
}
rs.extentKey = proto.ExtentKey{}
}
}
func (reader *Reader) readSliceRange(ctx context.Context, rs *rwSlice) (err error) {
defer reader.wg.Done()
log.LogDebugf("TRACE blobStore readSliceRange Enter. ino(%v) rs.fileOffset(%v),rs.rOffset(%v),rs.rSize(%v) ", reader.ino, rs.fileOffset, rs.rOffset, rs.rSize)
cacheKey := util.GenerateKey(reader.volName, reader.ino, rs.fileOffset)
log.LogDebugf("TRACE blobStore readSliceRange. ino(%v) cacheKey(%v) ", reader.ino, cacheKey)
buf := make([]byte, rs.rSize)
var readN int
bgTime := stat.BeginStat()
stat.EndStat("CacheGet", nil, bgTime, 1)
// metric covering all requests for each block.
metric := exporter.NewTPCnt("CacheGet")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: reader.volName})
}()
// read local cache
if reader.enableBcache {
readN, err = reader.bc.Get(cacheKey, buf, rs.rOffset, rs.rSize)
if err == nil {
reader.ec.BcacheHealth = true
if readN == int(rs.rSize) {
// L1 cache hit.
metric := exporter.NewTPCnt("L1CacheGetHit")
stat.EndStat("CacheHit-L1", nil, bgTime, 1)
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: reader.volName})
}()
copy(rs.Data, buf)
reader.err <- nil
return
}
}
}
readLimitOn := false
// read cfs and cache to bcache
if rs.extentKey != (proto.ExtentKey{}) {
// check whether the data partition exists in the preload scenario
err = reader.ec.CheckDataPartitionExsit(rs.extentKey.PartitionId)
if err == nil || ctx.Value("objectnode") != nil {
readN, err, readLimitOn = reader.ec.ReadExtent(reader.ino, &rs.extentKey, buf, int(rs.rOffset), int(rs.rSize))
if err == nil && readN == int(rs.rSize) {
// L2 cache hit.
metric := exporter.NewTPCnt("L2CacheGetHit")
stat.EndStat("CacheHit-L2", nil, bgTime, 1)
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: reader.volName})
}()
copy(rs.Data, buf)
reader.err <- nil
return
}
} else {
log.LogDebugf("checkDataPartitionExsit failed (%v)", err)
}
log.LogDebugf("TRACE blobStore readSliceRange. cfs block miss.extentKey=%v,err=%v", rs.extentKey, err)
}
if !readLimitOn {
reader.limitManager.ReadAlloc(ctx, int(rs.rSize))
}
readN, err = reader.ebs.Read(ctx, reader.volName, buf, rs.rOffset, uint64(rs.rSize), rs.objExtentKey)
if err != nil {
reader.err <- err
return
}
read := copy(rs.Data, buf)
reader.err <- nil
// cache full block
if !reader.needCacheL1() && !reader.needCacheL2() || reader.ec.IsPreloadMode() {
log.LogDebugf("TRACE blobStore readSliceRange exit without cache. read counter=%v", read)
return nil
}
asyncCtx := context.Background()
go reader.asyncCache(asyncCtx, cacheKey, rs.objExtentKey)
log.LogDebugf("TRACE blobStore readSliceRange exit with cache. read counter=%v", read)
return nil
}
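// asyncCache re-reads a whole object extent from the blobstore in the background and populates the
// caches: the block is written either to the CubeFS L2 cache through the extent client or to the
// local L1 block cache. inflightL2cache deduplicates concurrent loads of the same cacheKey.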
func (reader *Reader) asyncCache(ctx context.Context, cacheKey string, objExtentKey proto.ObjExtentKey) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("read-async-cache", err, bgTime, 1)
}()
log.LogDebugf("TRACE blobStore asyncCache Enter. cacheKey=%v", cacheKey)
// skip if the block is already being loaded by another goroutine.
if _, ok := reader.inflightL2cache.Load(cacheKey); ok {
return
}
reader.inflightL2cache.Store(cacheKey, true)
defer reader.inflightL2cache.Delete(cacheKey)
buf := make([]byte, objExtentKey.Size)
read, err := reader.ebs.Read(ctx, reader.volName, buf, 0, uint64(len(buf)), objExtentKey)
if err != nil || read != len(buf) {
log.LogErrorf("ERROR blobStore asyncCache fail, size no match. cacheKey=%v, objExtentKey.size=%v, read=%v",
cacheKey, len(buf), read)
return
}
if reader.needCacheL2() {
reader.ec.Write(reader.ino, int(objExtentKey.FileOffset), buf, proto.FlagsCache, nil)
log.LogDebugf("TRACE blobStore asyncCache(L2) Exit. cacheKey=%v", cacheKey)
return
}
if reader.needCacheL1() {
reader.bc.Put(cacheKey, buf)
}
log.LogDebugf("TRACE blobStore asyncCache(L1) Exit. cacheKey=%v", cacheKey)
}
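// needCacheL2 reports whether a block should be cached back into CubeFS (L2): either the volume's
// cache action allows it and the file is below the cache threshold, or file-level caching is enabled.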
func (reader *Reader) needCacheL2() bool {
if (reader.cacheAction > proto.NoCache && reader.fileLength < uint64(reader.cacheThreshold)) || reader.fileCache {
return true
}
return false
}
func (reader *Reader) needCacheL1() bool {
return reader.enableBcache
}
func (reader *Reader) refreshEbsExtents() {
_, _, eks, oeks, err := reader.mw.GetObjExtents(reader.ino)
if err != nil {
reader.valid = false
log.LogErrorf("TRACE blobStore refreshEbsExtents error. ino(%v) err(%v) ", reader.ino, err)
return
}
reader.valid = true
reader.extentKeys = eks
reader.objExtentKeys = oeks
log.LogDebugf("TRACE blobStore refreshEbsExtents ok. extentKeys(%v) objExtentKeys(%v) ", reader.extentKeys, reader.objExtentKeys)
}
func (reader *Reader) fileSize() (uint64, bool) {
objKeys := reader.objExtentKeys
if !reader.valid {
return 0, false
}
if len(objKeys) > 0 {
lastIndex := len(objKeys) - 1
return objKeys[lastIndex].FileOffset + objKeys[lastIndex].Size, true
}
return 0, true
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package blobstore
type Instance struct {
mq chan task
}
type task struct {
op *rwSlice
fn func(op *rwSlice)
}
func New(worker int, size int) Instance {
mq := make(chan task, size)
for i := 0; i < worker; i++ {
go func() {
for {
task, ok := <-mq
if !ok {
break
}
task.fn(task.op)
}
}()
}
return Instance{mq}
}
func (r Instance) Execute(op *rwSlice, fn func(op *rwSlice)) {
r.mq <- task{
op: op,
fn: fn,
}
}
func (r Instance) Close() {
close(r.mq)
}
type Executor struct {
tokens chan int
}
func NewExecutor(maxConcurrency int) *Executor {
exec := &Executor{
tokens: make(chan int, maxConcurrency),
}
for i := 0; i < maxConcurrency; i++ {
exec.tokens <- i
}
return exec
}
func (exec *Executor) Run(fn func()) {
i := <-exec.tokens
go func() {
fn()
exec.tokens <- i
}()
}
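// Illustrative sketch (worker counts and the process/doWork helpers are placeholders): the slice
// pool above fans work out to a fixed number of workers over a buffered channel, while Executor
// caps the number of in-flight goroutines with a token channel.
//
//	pool := New(4, len(slices))
//	defer pool.Close()
//	for _, rs := range slices {
//		pool.Execute(rs, func(op *rwSlice) { process(op) })
//	}
//
//	exec := NewExecutor(4)
//	exec.Run(func() { doWork() })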
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package blobstore
import (
"context"
"fmt"
"hash"
"io"
"sort"
"sync"
"sync/atomic"
"syscall"
"github.com/cubefs/cubefs/blockcache/bcache"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/manager"
"github.com/cubefs/cubefs/sdk/data/stream"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/buf"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
const (
MaxBufferSize = 512 * util.MB
)
type wSliceErr struct {
err error
fileOffset uint64
size uint32
}
type Writer struct {
volType int
volName string
blockSize int
ino uint64
err chan *wSliceErr
bc *bcache.BcacheClient
mw *meta.MetaWrapper
ec *stream.ExtentClient
ebsc *BlobStoreClient
wConcurrency int
wg sync.WaitGroup
once sync.Once
sync.RWMutex
enableBcache bool
cacheAction int
buf []byte
fileOffset int
fileCache bool
fileSize uint64
cacheThreshold int
dirty bool
blockPosition int
limitManager *manager.LimitManager
}
func NewWriter(config ClientConfig) (writer *Writer) {
writer = new(Writer)
writer.volName = config.VolName
writer.volType = config.VolType
writer.blockSize = config.BlockSize
writer.ino = config.Ino
writer.err = nil
writer.bc = config.Bc
writer.mw = config.Mw
writer.ec = config.Ec
writer.ebsc = config.Ebsc
writer.wConcurrency = config.WConcurrency
writer.wg = sync.WaitGroup{}
writer.once = sync.Once{}
writer.RWMutex = sync.RWMutex{}
writer.enableBcache = config.EnableBcache
writer.cacheAction = config.CacheAction
writer.fileCache = config.FileCache
writer.fileSize = config.FileSize
writer.cacheThreshold = config.CacheThreshold
writer.dirty = false
writer.allocateCache()
writer.limitManager = writer.ec.LimitManager
return
}
func (writer *Writer) String() string {
return fmt.Sprintf("Writer{address(%v),volName(%v),volType(%v),ino(%v),blockSize(%v),fileSize(%v),enableBcache(%v),cacheAction(%v),fileCache(%v),cacheThreshold(%v)},wConcurrency(%v)",
&writer, writer.volName, writer.volType, writer.ino, writer.blockSize, writer.fileSize, writer.enableBcache, writer.cacheAction, writer.fileCache, writer.cacheThreshold, writer.wConcurrency)
}
func (writer *Writer) WriteWithoutPool(ctx context.Context, offset int, data []byte) (size int, err error) {
// atomic.StoreInt32(&writer.idle, 0)
if writer == nil {
return 0, fmt.Errorf("writer is not opened yet")
}
log.LogDebugf("TRACE blobStore WriteWithoutPool Enter: ino(%v) offset(%v) len(%v) fileSize(%v)",
writer.ino, offset, len(data), writer.CacheFileSize())
if len(data) > MaxBufferSize || offset != writer.CacheFileSize() {
log.LogErrorf("TRACE blobStore WriteWithoutPool error,may be len(%v)>512MB,offset(%v)!=fileSize(%v)",
len(data), offset, writer.CacheFileSize())
err = syscall.EOPNOTSUPP
return
}
// write buffer
log.LogDebugf("TRACE blobStore WriteWithoutPool: ino(%v) offset(%v) len(%v)",
writer.ino, offset, len(data))
size, err = writer.doBufferWriteWithoutPool(ctx, data, offset)
return
}
func (writer *Writer) Write(ctx context.Context, offset int, data []byte, flags int) (size int, err error) {
// atomic.StoreInt32(&writer.idle, 0)
if writer == nil {
return 0, fmt.Errorf("writer is not opened yet")
}
log.LogDebugf("TRACE blobStore Write Enter: ino(%v) offset(%v) len(%v) flags&proto.FlagsAppend(%v) fileSize(%v)", writer.ino, offset, len(data), flags&proto.FlagsAppend, writer.CacheFileSize())
if len(data) > MaxBufferSize || flags&proto.FlagsAppend == 0 || offset != writer.CacheFileSize() {
log.LogErrorf("TRACE blobStore Write error,may be len(%v)>512MB,flags(%v)!=flagAppend,offset(%v)!=fileSize(%v)", len(data), flags&proto.FlagsAppend, offset, writer.CacheFileSize())
err = syscall.EOPNOTSUPP
return
}
// write buffer
log.LogDebugf("TRACE blobStore Write: ino(%v) offset(%v) len(%v) flags&proto.FlagsSyncWrite(%v)", writer.ino, offset, len(data), flags&proto.FlagsSyncWrite)
if flags&proto.FlagsSyncWrite == 0 {
size, err = writer.doBufferWrite(ctx, data, offset)
return
}
// parallel IO: write directly to ebs
size, err = writer.doParallelWrite(ctx, data, offset)
return
}
func (writer *Writer) doParallelWrite(ctx context.Context, data []byte, offset int) (size int, err error) {
log.LogDebugf("TRACE blobStore doDirectWrite: ino(%v) offset(%v) len(%v)", writer.ino, offset, len(data))
writer.Lock()
defer writer.Unlock()
wSlices := writer.prepareWriteSlice(offset, data)
log.LogDebugf("TRACE blobStore prepareWriteSlice: wSlices(%v)", wSlices)
sliceSize := len(wSlices)
writer.wg.Add(sliceSize)
writer.err = make(chan *wSliceErr, sliceSize)
pool := New(writer.wConcurrency, sliceSize)
defer pool.Close()
for _, wSlice := range wSlices {
pool.Execute(wSlice, func(param *rwSlice) {
writer.writeSlice(ctx, param, true)
})
}
writer.wg.Wait()
for i := 0; i < sliceSize; i++ {
if wErr := <-writer.err; wErr != nil {
log.LogErrorf("slice write error,ino(%v) fileoffset(%v) sliceSize(%v) err(%v)",
writer.ino, wErr.fileOffset, wErr.size, wErr.err)
return 0, wErr.err
}
}
close(writer.err)
// update meta
oeks := make([]proto.ObjExtentKey, 0)
for _, wSlice := range wSlices {
size += int(wSlice.size)
oeks = append(oeks, wSlice.objExtentKey)
}
log.LogDebugf("TRACE blobStore appendObjExtentKeys: oeks(%v)", oeks)
if err = writer.mw.AppendObjExtentKeys(writer.ino, oeks); err != nil {
log.LogErrorf("slice write error,meta append ebsc extent keys fail,ino(%v) fileOffset(%v) len(%v) err(%v)", writer.ino, offset, len(data), err)
return
}
atomic.AddUint64(&writer.fileSize, uint64(size))
for _, wSlice := range wSlices {
writer.cacheLevel2(wSlice)
}
return
}
func (writer *Writer) cacheLevel2(wSlice *rwSlice) {
if (writer.cacheAction == proto.RWCache && (wSlice.fileOffset+uint64(wSlice.size)) < uint64(writer.cacheThreshold)) || writer.fileCache {
buf := make([]byte, wSlice.size)
offSet := int(wSlice.fileOffset)
copy(buf, wSlice.Data)
go writer.asyncCache(writer.ino, offSet, buf)
}
}
func (writer *Writer) WriteFromReader(ctx context.Context, reader io.Reader, h hash.Hash) (size uint64, err error) {
var (
tmp = buf.ReadBufPool.Get().([]byte)
exec = NewExecutor(writer.wConcurrency)
leftToWrite int
)
defer buf.ReadBufPool.Put(tmp)
writer.fileOffset = 0
writer.err = make(chan *wSliceErr)
var oeksLock sync.RWMutex
oeks := make([]proto.ObjExtentKey, 0)
writeBuff := func() {
bufSize := len(writer.buf)
log.LogDebugf("writeBuff: bufSize(%v), leftToWrite(%v), err(%v)", bufSize, leftToWrite, err)
if bufSize == writer.blockSize || (leftToWrite == 0 && err == io.EOF) {
wSlice := &rwSlice{
fileOffset: uint64(writer.fileOffset - bufSize),
size: uint32(bufSize),
}
wSlice.Data = make([]byte, bufSize)
copy(wSlice.Data, writer.buf)
writer.buf = writer.buf[:0]
if (err == nil || err == io.EOF) && h != nil {
h.Write(wSlice.Data)
log.LogDebugf("writeBuff: bufSize(%v), md5", bufSize)
}
writer.wg.Add(1)
write := func() {
defer writer.wg.Done()
err := writer.writeSlice(ctx, wSlice, false)
if err != nil {
writer.Lock()
if len(writer.err) > 0 {
writer.Unlock()
return
}
wErr := &wSliceErr{
err: err,
fileOffset: wSlice.fileOffset,
size: wSlice.size,
}
writer.err <- wErr
writer.Unlock()
return
}
oeksLock.Lock()
oeks = append(oeks, wSlice.objExtentKey)
oeksLock.Unlock()
writer.cacheLevel2(wSlice)
}
exec.Run(write)
}
}
LOOP:
for {
position := 0
leftToWrite, err = reader.Read(tmp)
if err != nil && err != io.EOF {
return
}
for leftToWrite > 0 {
log.LogDebugf("WriteFromReader: leftToWrite(%v), err(%v)", leftToWrite, err)
writer.RLock()
errNum := len(writer.err)
writer.RUnlock()
if errNum > 0 {
break LOOP
}
freeSize := writer.blockSize - len(writer.buf)
writeSize := util.Min(leftToWrite, freeSize)
writer.buf = append(writer.buf, tmp[position:position+writeSize]...)
position += writeSize
leftToWrite -= writeSize
writer.fileOffset += writeSize
writer.dirty = true
writeBuff()
}
if err == io.EOF {
log.LogDebugf("WriteFromReader: EOF")
if len(writer.buf) > 0 {
writeBuff()
}
err = nil
writer.wg.Wait()
var wErr *wSliceErr
select {
case wErr = <-writer.err:
err = wErr.err
default:
}
if err != nil {
log.LogErrorf("slice write error,ino(%v) fileoffset(%v) sliceSize(%v) err(%v)", writer.ino, wErr.fileOffset, wErr.size, err)
}
break
}
}
log.LogDebugf("WriteFromReader before sort: %v", oeks)
sort.Slice(oeks, func(i, j int) bool {
return oeks[i].FileOffset < oeks[j].FileOffset
})
log.LogDebugf("WriteFromReader after sort: %v", oeks)
if err = writer.mw.AppendObjExtentKeys(writer.ino, oeks); err != nil {
log.LogErrorf("WriteFromReader error,meta append ebsc extent keys fail,ino(%v), err(%v)", writer.ino, err)
return
}
size = uint64(writer.fileOffset)
atomic.AddUint64(&writer.fileSize, size)
return
}
func (writer *Writer) doBufferWriteWithoutPool(ctx context.Context, data []byte, offset int) (size int, err error) {
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool Enter: ino(%v) offset(%v) len(%v)", writer.ino, offset, len(data))
writer.fileOffset = offset
dataSize := len(data)
position := 0
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool: ino(%v) writer.buf.len(%v) writer.blocksize(%v)", writer.ino, len(writer.buf), writer.blockSize)
writer.Lock()
defer writer.Unlock()
for dataSize > 0 {
freeSize := writer.blockSize - len(writer.buf)
if dataSize < freeSize {
freeSize = dataSize
}
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool: ino(%v) writer.fileSize(%v) writer.fileOffset(%v) position(%v) freeSize(%v)", writer.ino, writer.fileSize, writer.fileOffset, position, freeSize)
writer.buf = append(writer.buf, data[position:position+freeSize]...)
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool:ino(%v) writer.buf.len(%v)", writer.ino, len(writer.buf))
position += freeSize
dataSize -= freeSize
writer.fileOffset += freeSize
writer.dirty = true
if len(writer.buf) == writer.blockSize {
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool: ino(%v) writer.buf.len(%v) writer.blocksize(%v)", writer.ino, len(writer.buf), writer.blockSize)
writer.Unlock()
err = writer.flushWithoutPool(writer.ino, ctx, false)
writer.Lock()
if err != nil {
writer.buf = writer.buf[:len(writer.buf)-len(data)]
writer.fileOffset -= len(data)
return
}
}
}
size = len(data)
atomic.AddUint64(&writer.fileSize, uint64(size))
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool Exit: ino(%v) writer.fileSize(%v) writer.fileOffset(%v)", writer.ino, writer.fileSize, writer.fileOffset)
return size, nil
}
func (writer *Writer) doBufferWrite(ctx context.Context, data []byte, offset int) (size int, err error) {
log.LogDebugf("TRACE blobStore doBufferWrite Enter: ino(%v) offset(%v) len(%v)", writer.ino, offset, len(data))
writer.fileOffset = offset
dataSize := len(data)
position := 0
log.LogDebugf("TRACE blobStore doBufferWrite: ino(%v) writer.buf.len(%v) writer.blocksize(%v)", writer.ino, len(writer.buf), writer.blockSize)
writer.Lock()
defer writer.Unlock()
for dataSize > 0 {
freeSize := writer.blockSize - writer.blockPosition
if dataSize < freeSize {
freeSize = dataSize
}
log.LogDebugf("TRACE blobStore doBufferWrite: ino(%v) writer.fileSize(%v) writer.fileOffset(%v) writer.blockPosition(%v) position(%v) freeSize(%v)", writer.ino, writer.fileSize, writer.fileOffset, writer.blockPosition, position, freeSize)
copy(writer.buf[writer.blockPosition:], data[position:position+freeSize])
log.LogDebugf("TRACE blobStore doBufferWrite:ino(%v) writer.buf.len(%v)", writer.ino, len(writer.buf))
position += freeSize
writer.blockPosition += freeSize
dataSize -= freeSize
writer.fileOffset += freeSize
writer.dirty = true
if writer.blockPosition == writer.blockSize {
log.LogDebugf("TRACE blobStore doBufferWrite: ino(%v) writer.buf.len(%v) writer.blocksize(%v)", writer.ino, len(writer.buf), writer.blockSize)
writer.Unlock()
err = writer.flush(writer.ino, ctx, false)
writer.Lock()
if err != nil {
writer.buf = writer.buf[:writer.blockPosition-freeSize]
writer.fileOffset -= freeSize
writer.blockPosition -= freeSize
return
}
}
}
size = len(data)
atomic.AddUint64(&writer.fileSize, uint64(size))
log.LogDebugf("TRACE blobStore doBufferWrite Exit: ino(%v) writer.fileSize(%v) writer.fileOffset(%v)", writer.ino, writer.fileSize, writer.fileOffset)
return size, nil
}
func (writer *Writer) FlushWithoutPool(ino uint64, ctx context.Context) (err error) {
if writer == nil {
return
}
return writer.flushWithoutPool(ino, ctx, true)
}
func (writer *Writer) Flush(ino uint64, ctx context.Context) (err error) {
if writer == nil {
return
}
return writer.flush(ino, ctx, true)
}
func (writer *Writer) shouldCacheCfs() bool {
return writer.cacheAction == proto.RWCache
}
func (writer *Writer) prepareWriteSlice(offset int, data []byte) []*rwSlice {
size := len(data)
wSlices := make([]*rwSlice, 0)
wSliceCount := size / writer.blockSize
remainSize := size % writer.blockSize
for index := 0; index < wSliceCount; index++ {
offset := offset + index*writer.blockSize
wSlice := &rwSlice{
index: index,
fileOffset: uint64(offset),
size: uint32(writer.blockSize),
Data: data[index*writer.blockSize : (index+1)*writer.blockSize],
}
wSlices = append(wSlices, wSlice)
}
offset = offset + wSliceCount*writer.blockSize
if remainSize > 0 {
wSlice := &rwSlice{
index: wSliceCount,
fileOffset: uint64(offset),
size: uint32(remainSize),
Data: data[wSliceCount*writer.blockSize:],
}
wSlices = append(wSlices, wSlice)
}
return wSlices
}
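// Worked example (assuming blockSize = 8MB): a 20MB parallel write at offset 0 is split by
// prepareWriteSlice into three slices: two full 8MB slices at fileOffsets 0 and 8MB, plus a 4MB
// remainder slice at fileOffset 16MB.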
func (writer *Writer) writeSlice(ctx context.Context, wSlice *rwSlice, wg bool) (err error) {
if wg {
defer writer.wg.Done()
}
writer.limitManager.WriteAlloc(ctx, int(wSlice.size))
log.LogDebugf("TRACE blobStore,writeSlice to ebs. ino(%v) fileOffset(%v) len(%v)", writer.ino, wSlice.fileOffset, wSlice.size)
location, err := writer.ebsc.Write(ctx, writer.volName, wSlice.Data, wSlice.size)
if err != nil {
if wg {
writer.err <- &wSliceErr{err: err, fileOffset: wSlice.fileOffset, size: wSlice.size}
}
return err
}
log.LogDebugf("TRACE blobStore,location(%v)", location)
blobs := make([]proto.Blob, 0)
for _, info := range location.Blobs {
blob := proto.Blob{
MinBid: uint64(info.MinBid),
Count: uint64(info.Count),
Vid: uint64(info.Vid),
}
blobs = append(blobs, blob)
}
wSlice.objExtentKey = proto.ObjExtentKey{
Cid: uint64(location.ClusterID),
CodeMode: uint8(location.CodeMode),
Size: location.Size,
BlobSize: location.BlobSize,
Blobs: blobs,
BlobsLen: uint32(len(blobs)),
FileOffset: wSlice.fileOffset,
Crc: location.Crc,
}
log.LogDebugf("TRACE blobStore,objExtentKey(%v)", wSlice.objExtentKey)
if wg {
writer.err <- nil
}
return
}
func (writer *Writer) asyncCache(ino uint64, offset int, data []byte) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("write-async-cache", err, bgTime, 1)
}()
log.LogDebugf("TRACE asyncCache Enter,fileOffset(%v) len(%v)", offset, len(data))
write, err := writer.ec.Write(ino, offset, data, proto.FlagsCache, nil)
log.LogDebugf("TRACE asyncCache Exit,write(%v) err(%v)", write, err)
}
func (writer *Writer) resetBufferWithoutPool() {
writer.buf = writer.buf[:0]
}
func (writer *Writer) resetBuffer() {
// writer.buf = writer.buf[:0]
writer.blockPosition = 0
}
func (writer *Writer) flushWithoutPool(inode uint64, ctx context.Context, flushFlag bool) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("blobstore-flush", err, bgTime, 1)
}()
log.LogDebugf("TRACE blobStore flushWithoutPool: ino(%v) buf-len(%v) flushFlag(%v)", inode, len(writer.buf), flushFlag)
writer.Lock()
defer func() {
writer.dirty = false
writer.Unlock()
}()
if len(writer.buf) == 0 || !writer.dirty {
return
}
bufferSize := len(writer.buf)
wSlice := &rwSlice{
fileOffset: uint64(writer.fileOffset - bufferSize),
size: uint32(bufferSize),
Data: writer.buf,
}
err = writer.writeSlice(ctx, wSlice, false)
if err != nil {
if flushFlag {
atomic.AddUint64(&writer.fileSize, -uint64(bufferSize))
}
return
}
oeks := make([]proto.ObjExtentKey, 0)
// update meta
oeks = append(oeks, wSlice.objExtentKey)
if err = writer.mw.AppendObjExtentKeys(writer.ino, oeks); err != nil {
log.LogErrorf("slice write error,meta append ebsc extent keys fail,ino(%v) fileOffset(%v) len(%v) err(%v)", inode, wSlice.fileOffset, wSlice.size, err)
return
}
writer.resetBufferWithoutPool()
writer.cacheLevel2(wSlice)
return
}
func (writer *Writer) flush(inode uint64, ctx context.Context, flushFlag bool) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("blobstore-flush", err, bgTime, 1)
}()
log.LogDebugf("TRACE blobStore flush: ino(%v) buf-len(%v) flushFlag(%v)", inode, len(writer.buf), flushFlag)
writer.Lock()
defer func() {
writer.dirty = false
writer.Unlock()
}()
if len(writer.buf) == 0 || !writer.dirty {
return
}
bufferSize := writer.blockPosition
wSlice := &rwSlice{
fileOffset: uint64(writer.fileOffset - bufferSize),
size: uint32(bufferSize),
Data: writer.buf,
}
err = writer.writeSlice(ctx, wSlice, false)
if err != nil {
if flushFlag {
atomic.AddUint64(&writer.fileSize, -uint64(bufferSize))
}
return
}
oeks := make([]proto.ObjExtentKey, 0)
// update meta
oeks = append(oeks, wSlice.objExtentKey)
if err = writer.mw.AppendObjExtentKeys(writer.ino, oeks); err != nil {
log.LogErrorf("slice write error,meta append ebsc extent keys fail,ino(%v) fileOffset(%v) len(%v) err(%v)", inode, wSlice.fileOffset, wSlice.size, err)
return
}
writer.resetBuffer()
writer.cacheLevel2(wSlice)
return
}
func (writer *Writer) CacheFileSize() int {
return int(atomic.LoadUint64(&writer.fileSize))
}
func (writer *Writer) FreeCache() {
if writer == nil {
return
}
if buf.CachePool == nil {
return
}
writer.once.Do(func() {
tmpBuf := writer.buf
writer.buf = nil
if tmpBuf != nil {
buf.CachePool.Put(tmpBuf)
}
})
}
func (writer *Writer) allocateCache() {
if buf.CachePool == nil {
return
}
writer.buf = buf.CachePool.Get()
}
package manager
import (
"container/list"
"context"
"math"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
const (
runNow = 1
runLater = 2
gridHitLimitCnt = 1
girdCntOneSecond = 3
gridWindowTimeScope = 10
qosExpireTime = 20
qosReportMinGap = uint32(time.Second) / 2
defaultMagnifyFactor = 100
)
type UploadFlowInfoFunc func(clientInfo wrapper.SimpleClientInfo) error
type GridElement struct {
time time.Time
used uint64
limit uint64
buffer uint64
hitLimit bool
ID uint64
sync.RWMutex
}
type AllocElement struct {
used uint32
magnify uint32
future *util.Future
}
type LimitFactor struct {
factorType uint32
gridList *list.List
waitList *list.List
gidHitLimitCnt uint8
mgr *LimitManager
gridId uint64
magnify uint32
winBuffer uint64
lock sync.RWMutex
valAllocApply uint64
valAllocCommit uint64
valAllocLastApply uint64
valAllocLastCommit uint64
isSetLimitZero bool
}
func (factor *LimitFactor) getNeedByMagnify(allocCnt uint32, magnify uint32) uint64 {
if magnify == 0 {
return 0
}
if allocCnt > 1000 {
log.QosWriteDebugf("action[getNeedByMagnify] allocCnt %v", allocCnt)
magnify = defaultMagnifyFactor
}
need := uint64(allocCnt * magnify)
if factor.factorType == proto.FlowWriteType || factor.factorType == proto.FlowReadType {
if need > util.GB/8 {
need = util.GB / 8
}
}
return need
}
func (factor *LimitFactor) alloc(allocCnt uint32) (ret uint8, future *util.Future) {
log.QosWriteDebugf("action[alloc] type [%v] alloc [%v], tmp factor waitlist [%v] hitlimtcnt [%v] len [%v]", proto.QosTypeString(factor.factorType),
allocCnt, factor.waitList.Len(), factor.gidHitLimitCnt, factor.gridList.Len())
atomic.AddUint64(&factor.valAllocApply, uint64(allocCnt))
if !factor.mgr.enable {
// the used value does not need to be accurate here; the purpose is only to report usage to the master,
// and skipping the lock gives better performance as long as the used value stays above 0
gridEnd := factor.gridList.Back()
if gridEnd != nil {
grid := gridEnd.Value.(*GridElement)
// grid.used = grid.used+uint64(allocCnt)
atomic.AddUint64(&grid.used, uint64(allocCnt))
// atomic.CompareAndSwapUint64(&factor.valAllocApply, factor.valAllocApply, factor.valAllocApply+uint64(allocCnt))
}
return runNow, nil
}
type activeSt struct {
activeUpdate bool
needWait bool
}
activeState := &activeSt{}
defer func(active *activeSt) {
if !active.needWait {
factor.lock.RUnlock()
} else if !active.activeUpdate {
factor.lock.Unlock()
}
}(activeState)
factor.lock.RLock()
grid := factor.gridList.Back().Value.(*GridElement)
if factor.mgr.enable && (factor.waitList.Len() > 0 || atomic.LoadUint64(&grid.used)+uint64(allocCnt) > grid.limit+grid.buffer) {
factor.lock.RUnlock()
factor.lock.Lock()
activeState.needWait = true
future = util.NewFuture()
factor.waitList.PushBack(&AllocElement{
used: allocCnt,
future: future,
magnify: factor.magnify,
})
if !grid.hitLimit {
factor.gidHitLimitCnt++
// One second spans several grids. gidHitLimitCnt counts how many grids hit their limit in the latest second;
// if it exceeds the trigger threshold, ask the master to enlarge the factor limit.
// GetSimpleVolView will call back the simpleClient function to collect factor info and send it to the master.
if factor.gidHitLimitCnt >= factor.mgr.HitTriggerCnt {
tmpTime := time.Now()
if factor.mgr.lastReqTime.Add(time.Duration(factor.mgr.ReqPeriod) * time.Second).Before(tmpTime) {
factor.mgr.lastReqTime = tmpTime
log.QosWriteDebugf("CheckGrid factor [%v] unlock before active update simple vol view,gird id[%v] limit[%v] buffer [%v] used [%v]",
proto.QosTypeString(factor.factorType), grid.ID, grid.limit, grid.buffer, grid.used)
// Unlock must be called here; UpdateSimpleVolView will take the lock again.
grid.hitLimit = true
factor.lock.Unlock()
activeState.activeUpdate = true
go factor.mgr.WrapperUpdate(factor.mgr.simpleClient)
}
}
}
grid.hitLimit = true
return runLater, future
}
atomic.AddUint64(&grid.used, uint64(allocCnt))
// atomic.CompareAndSwapUint64(&grid.used, grid.used, grid.used+uint64(allocCnt))
return runNow, future
}
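// SetLimit applies the per-second limit and buffer pushed by the master to the newest grid,
// dividing them across the grids of one second and enforcing minimum values when the limit is zero.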
func (factor *LimitFactor) SetLimit(limitVal uint64, bufferVal uint64) {
log.QosWriteDebugf("action[SetLimit] factor type [%v] limitVal [%v] bufferVal [%v]", proto.QosTypeString(factor.factorType), limitVal, bufferVal)
var grid *GridElement
factor.mgr.lastTimeOfSetLimit = time.Now()
factor.lock.Lock()
defer func() {
factor.TryReleaseWaitList()
factor.lock.Unlock()
}()
if factor.gridList.Len() == 0 {
grid = &GridElement{
time: time.Now(),
limit: limitVal / girdCntOneSecond,
buffer: bufferVal / girdCntOneSecond,
ID: factor.gridId,
}
factor.gridId++
factor.gridList.PushBack(grid)
} else {
grid = factor.gridList.Back().Value.(*GridElement)
grid.buffer = bufferVal / girdCntOneSecond
grid.limit = limitVal / girdCntOneSecond
}
if grid.limit == 0 {
factor.isSetLimitZero = true
switch factor.factorType {
case proto.IopsReadType, proto.IopsWriteType:
grid.limit = proto.MinIopsLimit / girdCntOneSecond
if grid.limit == 0 {
grid.limit = 1
}
case proto.FlowReadType, proto.FlowWriteType:
grid.limit = proto.MinFLowLimit / girdCntOneSecond
if grid.limit == 0 {
grid.limit = 10 * util.KB
}
default:
// do nothing
}
} else {
factor.isSetLimitZero = false
}
grid = factor.gridList.Back().Value.(*GridElement)
log.QosWriteDebugf("action[SetLimit] factor type [%v] gird id %v limit %v buffer %v",
proto.QosTypeString(factor.factorType), grid.ID, grid.limit, grid.buffer)
}
// TryReleaseWaitList drains the wait list after the limit has been enlarged by the master.
// No extra locking is needed for concurrency: the caller owns the lock and will release it.
func (factor *LimitFactor) TryReleaseWaitList() {
gridIter := factor.gridList.Back()
tGrid := gridIter.Value.(*GridElement)
cnt := 0
for factor.waitList.Len() > 0 {
value := factor.waitList.Front()
ele := value.Value.(*AllocElement)
// log.LogDebugf("action[TryReleaseWaitList] type [%v] ele used [%v]", proto.QosTypeString(factor.factorType), ele.used)
for atomic.LoadUint64(&tGrid.used)+uint64(ele.used) > tGrid.limit+tGrid.buffer {
log.LogWarnf("action[TryReleaseWaitList] type [%v] new gird be used up.alloc in waitlist left cnt [%v],"+
"grid be allocated [%v] grid limit [%v] and buffer[%v], gird id:[%v], use pregrid size[%v]",
proto.QosTypeString(factor.factorType), factor.waitList.Len(), tGrid.used, tGrid.limit, tGrid.buffer,
tGrid.ID, uint32(tGrid.limit+tGrid.buffer-tGrid.used))
tUsed := atomic.LoadUint64(&tGrid.used)
val := tGrid.limit + tGrid.buffer - tUsed // unsigned subtraction may underflow; guarded by the check below
if tGrid.limit+tGrid.buffer > tUsed && ele.used >= uint32(val) { // not atomically protected; grid used may exceed limit plus buffer
ele.used -= uint32(val)
log.QosWriteDebugf("action[TryReleaseWaitList] type [%v] ele used reduce [%v] and left [%v]", proto.QosTypeString(factor.factorType), val, ele.used)
// atomic.AddUint64(&curGrid.used, tGrid.limit+ tGrid.buffer)
atomic.AddUint64(&tGrid.used, val)
}
cnt++
if gridIter.Prev() == nil || cnt >= girdCntOneSecond {
return
}
gridIter = gridIter.Prev()
tGrid = gridIter.Value.(*GridElement)
}
atomic.AddUint64(&tGrid.used, uint64(ele.used))
log.QosWriteDebugf("action[TryReleaseWaitList] type [%v] ele used [%v] consumed!", proto.QosTypeString(factor.factorType), ele.used)
ele.future.Respond(true, nil)
value = value.Next()
factor.waitList.Remove(factor.waitList.Front())
}
}
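// CheckGrid appends a fresh grid to the sliding window, drops grids that fall outside the
// window, and then tries to release waiters against the new capacity.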
func (factor *LimitFactor) CheckGrid() {
defer func() {
factor.lock.Unlock()
}()
factor.lock.Lock()
grid := factor.gridList.Back().Value.(*GridElement)
newGrid := &GridElement{
time: time.Now(),
limit: grid.limit,
used: 0,
buffer: grid.buffer,
ID: factor.gridId,
}
factor.gridId++
if factor.mgr.enable && factor.mgr.lastTimeOfSetLimit.Add(time.Second*qosExpireTime).Before(newGrid.time) {
log.LogWarnf("action[CheckGrid]. qos recv no command from master in long time, last time %v, grid time %v",
factor.mgr.lastTimeOfSetLimit, newGrid.time)
}
if factor.mgr.enable {
log.QosWriteDebugf("action[CheckGrid] factor type:[%v] gridlistLen:[%v] waitlistLen:[%v] hitlimitcnt:[%v] "+
"add new grid info girdid[%v] used:[%v] limit:[%v] buffer:[%v] time:[%v]",
proto.QosTypeString(factor.factorType), factor.gridList.Len(), factor.waitList.Len(), factor.gidHitLimitCnt,
newGrid.ID, newGrid.used, newGrid.limit, newGrid.buffer, newGrid.time)
}
factor.gridList.PushBack(newGrid)
for factor.gridList.Len() > gridWindowTimeScope*girdCntOneSecond {
firstGrid := factor.gridList.Front().Value.(*GridElement)
if firstGrid.hitLimit {
factor.gidHitLimitCnt--
if factor.mgr.enable {
log.QosWriteDebugf("action[CheckGrid] factor [%v] after minus gidHitLimitCnt:[%v]",
proto.QosTypeString(factor.factorType), factor.gidHitLimitCnt)
}
}
if factor.mgr.enable {
log.QosWriteDebugf("action[CheckGrid] type:[%v] remove oldest grid id[%v] info buffer:[%v] limit:[%v] used[%v] from gridlist",
proto.QosTypeString(factor.factorType), firstGrid.ID, firstGrid.buffer, firstGrid.limit, firstGrid.used)
}
factor.gridList.Remove(factor.gridList.Front())
}
factor.TryReleaseWaitList()
}
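// newLimitFactor creates a LimitFactor of the given type with an empty grid window and wait list.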
func newLimitFactor(mgr *LimitManager, factorType uint32) *LimitFactor {
limit := &LimitFactor{
mgr: mgr,
factorType: factorType,
waitList: list.New(),
gridList: list.New(),
magnify: defaultMagnifyFactor,
}
limit.SetLimit(0, 0)
return limit
}
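// LimitManager holds the per-type limit factors of a client and coordinates QoS reporting with the master.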
type LimitManager struct {
ID uint64
limitMap map[uint32]*LimitFactor
enable bool
simpleClient wrapper.SimpleClientInfo
exitCh chan struct{}
WrapperUpdate UploadFlowInfoFunc
ReqPeriod uint32
HitTriggerCnt uint8
lastReqTime time.Time
lastTimeOfSetLimit time.Time
isLastReqValid bool
once sync.Once
}
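// NewLimitManager builds a LimitManager with one LimitFactor per IOPS/flow type and starts the periodic grid check.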
func NewLimitManager(client wrapper.SimpleClientInfo) *LimitManager {
mgr := &LimitManager{
limitMap: make(map[uint32]*LimitFactor, 0),
enable: false, // assign from master
simpleClient: client,
HitTriggerCnt: gridHitLimitCnt,
ReqPeriod: 1,
}
mgr.limitMap[proto.IopsReadType] = newLimitFactor(mgr, proto.IopsReadType)
mgr.limitMap[proto.IopsWriteType] = newLimitFactor(mgr, proto.IopsWriteType)
mgr.limitMap[proto.FlowWriteType] = newLimitFactor(mgr, proto.FlowWriteType)
mgr.limitMap[proto.FlowReadType] = newLimitFactor(mgr, proto.FlowReadType)
mgr.ScheduleCheckGrid()
return mgr
}
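// GetWaitTotalSize sums the sizes of all allocations currently queued in the wait list.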
func (factor *LimitFactor) GetWaitTotalSize() (waitSize uint64) {
value := factor.waitList.Front()
for {
if value == nil {
break
}
ele := value.Value.(*AllocElement)
waitSize += uint64(ele.used)
value = value.Next()
}
return
}
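// CalcNeedByPow estimates how much quota to request from the master when requests are waiting.
// For flow types the wait-list size is added to used and need = 300MB * (used/300MB)^0.8, with a
// 128KB floor; for IOPS types the same power curve with a base of 300 is used.
// Rough illustration: used = 75MB gives need ≈ 300MB * 0.25^0.8 ≈ 99MB.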
func (limitManager *LimitManager) CalcNeedByPow(limitFactor *LimitFactor, used uint64) (need uint64) {
if limitFactor.waitList.Len() == 0 {
return 0
}
if limitFactor.factorType == proto.FlowReadType || limitFactor.factorType == proto.FlowWriteType {
used += limitFactor.GetWaitTotalSize()
if used < 128*util.KB {
need = 128 * util.KB
return
}
need = uint64(300 * util.MB * math.Pow(float64(used)/float64(300*util.MB), 0.8))
} else {
if used == 0 {
used = uint64(limitFactor.waitList.Len())
}
need = uint64(300 * math.Pow(float64(used)/float64(300), 0.8))
}
return
}
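// GetFlowInfo builds the per-factor usage report for the master; it returns false when neither
// this nor the previous round had activity worth reporting.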
func (limitManager *LimitManager) GetFlowInfo() (*proto.ClientReportLimitInfo, bool) {
info := &proto.ClientReportLimitInfo{
FactorMap: make(map[uint32]*proto.ClientLimitInfo, 0),
}
var (
validCliInfo bool
griCnt int
limit uint64
buffer uint64
)
for factorType, limitFactor := range limitManager.limitMap {
limitFactor.lock.RLock()
var reqUsed uint64
griCnt = 0
grid := limitFactor.gridList.Back()
grid = grid.Prev()
// reqUsed := limitFactor.valAllocLastCommit
for griCnt < limitFactor.gridList.Len()-1 {
reqUsed += atomic.LoadUint64(&grid.Value.(*GridElement).used)
limit += grid.Value.(*GridElement).limit
buffer += grid.Value.(*GridElement).buffer
griCnt++
// log.LogDebugf("action[GetFlowInfo] type [%v] grid id[%v] used %v limit %v buffer %v time %v sum_used %v sum_limit %v,len %v",
// proto.QosTypeString(factorType),
// grid.Value.(*GridElement).ID,
// grid.Value.(*GridElement).used,
// grid.Value.(*GridElement).limit,
// grid.Value.(*GridElement).buffer,
// grid.Value.(*GridElement).time,
// reqUsed,
// limit, limitFactor.gridList.Len())
if grid.Prev() == nil || griCnt >= girdCntOneSecond {
log.QosWriteDebugf("action[[GetFlowInfo] type [%v] grid count %v reqused %v list len %v",
proto.QosTypeString(factorType), griCnt, reqUsed, limitFactor.gridList.Len())
break
}
grid = grid.Prev()
}
if griCnt > 0 {
timeElapse := uint64(time.Second) * uint64(griCnt) / girdCntOneSecond
if timeElapse < uint64(qosReportMinGap) {
log.LogWarnf("action[GetFlowInfo] type [%v] timeElapse [%v] since last report",
proto.QosTypeString(limitFactor.factorType), timeElapse)
timeElapse = uint64(qosReportMinGap) // interval at which the vol view is fetched from the master; TODO: make this configurable
}
reqUsed = uint64(float64(reqUsed) / (float64(timeElapse) / float64(time.Second)))
}
factor := &proto.ClientLimitInfo{
Used: reqUsed,
Need: limitManager.CalcNeedByPow(limitFactor, reqUsed),
UsedLimit: limitFactor.gridList.Back().Value.(*GridElement).limit * girdCntOneSecond,
UsedBuffer: limitFactor.gridList.Back().Value.(*GridElement).buffer * girdCntOneSecond,
}
limitFactor.lock.RUnlock()
info.FactorMap[factorType] = factor
info.Host = wrapper.LocalIP
info.Status = proto.QosStateNormal
info.ID = limitManager.ID
if limitFactor.waitList.Len() > 0 ||
!limitFactor.isSetLimitZero ||
factor.Used|factor.Need > 0 {
log.QosWriteDebugf("action[GetFlowInfo] type [%v] len [%v] isSetLimitZero [%v] used [%v] need [%v]", proto.QosTypeString(limitFactor.factorType),
limitFactor.waitList.Len(), limitFactor.isSetLimitZero, factor.Used, factor.Need)
validCliInfo = true
}
if griCnt > 0 {
log.QosWriteDebugf("action[GetFlowInfo] type [%v] last commit[%v] report to master "+
"with simpleClient limit info [%v,%v,%v,%v],host [%v], "+
"status [%v] grid [%v, %v, %v]",
proto.QosTypeString(limitFactor.factorType), limitFactor.valAllocLastCommit,
factor.Used, factor.Need, factor.UsedBuffer, factor.UsedLimit, info.Host,
info.Status, grid.Value.(*GridElement).ID, grid.Value.(*GridElement).limit, grid.Value.(*GridElement).buffer)
}
}
lastValid := limitManager.isLastReqValid
limitManager.isLastReqValid = validCliInfo
limitManager.once.Do(func() {
validCliInfo = true
})
// If the client had no user requests in this and the previous round, do not report to the master.
if !lastValid && !validCliInfo {
return info, false
}
return info, true
}
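// ScheduleCheckGrid starts a background goroutine that advances the grid window for every
// factor several times per second and rolls the apply/commit counters roughly once per second.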
func (limitManager *LimitManager) ScheduleCheckGrid() {
go func() {
ticker := time.NewTicker(1000 / girdCntOneSecond * time.Millisecond)
defer func() {
ticker.Stop()
}()
var cnt uint64
for {
select {
case <-limitManager.exitCh:
return
case <-ticker.C:
cnt++
for factorType, limitFactor := range limitManager.limitMap {
limitFactor.CheckGrid()
if cnt%girdCntOneSecond == 0 {
log.QosWriteDebugf("action[ScheduleCheckGrid] type [%v] factor apply val:[%v] commit val:[%v]",
proto.QosTypeString(factorType), atomic.LoadUint64(&limitFactor.valAllocApply), atomic.LoadUint64(&limitFactor.valAllocCommit))
limitFactor.valAllocLastApply = atomic.LoadUint64(&limitFactor.valAllocApply)
limitFactor.valAllocLastCommit = atomic.LoadUint64(&limitFactor.valAllocCommit)
atomic.StoreUint64(&limitFactor.valAllocApply, 0)
atomic.StoreUint64(&limitFactor.valAllocCommit, 0)
}
}
}
}
}()
}
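// SetClientLimit applies the QoS settings pushed by the master: enable flag, hit trigger count,
// report period, per-factor limits and magnify factors.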
func (limitManager *LimitManager) SetClientLimit(limit *proto.LimitRsp2Client) {
if limit == nil {
log.LogErrorf("action[SetClientLimit] limit info is nil")
return
}
if limitManager.enable != limit.Enable {
log.LogWarnf("action[SetClientLimit] enable [%v]", limit.Enable)
}
limitManager.enable = limit.Enable
if limit.HitTriggerCnt > 0 {
log.LogWarnf("action[SetClientLimit] update to HitTriggerCnt [%v] from [%v]", limitManager.HitTriggerCnt, limit.HitTriggerCnt)
limitManager.HitTriggerCnt = limit.HitTriggerCnt
}
if limit.ReqPeriod > 0 {
log.LogWarnf("action[SetClientLimit] update to ReqPeriod [%v] from [%v]", limitManager.ReqPeriod, limit.ReqPeriod)
limitManager.ReqPeriod = limit.ReqPeriod
}
for factorType, clientLimitInfo := range limit.FactorMap {
limitManager.limitMap[factorType].SetLimit(clientLimitInfo.UsedLimit, clientLimitInfo.UsedBuffer)
}
for factorType, magnify := range limit.Magnify {
if magnify > 0 && magnify != limitManager.limitMap[factorType].magnify {
log.QosWriteDebugf("action[SetClientLimit] type [%v] update magnify [%v] to [%v]",
proto.QosTypeString(factorType), limitManager.limitMap[factorType].magnify, magnify)
limitManager.limitMap[factorType].magnify = magnify
}
}
}
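// ReadAlloc consumes one read IOPS token and size bytes of read flow, blocking when the QoS limit is hit.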
func (limitManager *LimitManager) ReadAlloc(ctx context.Context, size int) {
limitManager.WaitN(ctx, limitManager.limitMap[proto.IopsReadType], 1)
limitManager.WaitN(ctx, limitManager.limitMap[proto.FlowReadType], size)
}
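// WriteAlloc consumes one write IOPS token and size bytes of write flow, blocking when the QoS limit is hit.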
func (limitManager *LimitManager) WriteAlloc(ctx context.Context, size int) {
limitManager.WaitN(ctx, limitManager.limitMap[proto.IopsWriteType], 1)
limitManager.WaitN(ctx, limitManager.limitMap[proto.FlowWriteType], size)
}
// WaitN blocks until the allocation succeeds, an error is returned, or the context is done.
func (limitManager *LimitManager) WaitN(ctx context.Context, lim *LimitFactor, n int) (err error) {
var fut *util.Future
var ret uint8
if ret, fut = lim.alloc(uint32(n)); ret == runNow {
atomic.AddUint64(&lim.valAllocCommit, uint64(n))
log.QosWriteDebugf("action[WaitN] type [%v] return now waitlistlen [%v]", proto.QosTypeString(lim.factorType), lim.waitList.Len())
return nil
}
respCh, errCh := fut.AsyncResponse()
select {
case <-ctx.Done():
log.LogWarnf("action[WaitN] type [%v] ctx done return waitlistlen [%v]", proto.QosTypeString(lim.factorType), lim.waitList.Len())
return ctx.Err()
case err = <-errCh:
log.LogWarnf("action[WaitN] type [%v] err return waitlistlen [%v]", proto.QosTypeString(lim.factorType), lim.waitList.Len())
return
case <-respCh:
atomic.AddUint64(&lim.valAllocCommit, uint64(n))
log.QosWriteDebugf("action[WaitN] type [%v] return waitlistlen [%v]", proto.QosTypeString(lim.factorType), lim.waitList.Len())
return nil
// default:
}
}
func (limitManager *LimitManager) UpdateFlowInfo(limit *proto.LimitRsp2Client) {
limitManager.SetClientLimit(limit)
}
func (limitManager *LimitManager) SetClientID(id uint64) (err error) {
limitManager.ID = id
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"fmt"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
// ExtentRequest defines the struct for a request to read or write an extent.
type ExtentRequest struct {
FileOffset int
Size int
Data []byte
ExtentKey *proto.ExtentKey
}
// String returns the string format of the extent request.
func (er *ExtentRequest) String() string {
return fmt.Sprintf("FileOffset(%v) Size(%v) ExtentKey(%v)", er.FileOffset, er.Size, er.ExtentKey)
}
// NewExtentRequest returns a new extent request.
func NewExtentRequest(offset, size int, data []byte, ek *proto.ExtentKey) *ExtentRequest {
return &ExtentRequest{
FileOffset: offset,
Size: size,
Data: data,
ExtentKey: ek,
}
}
// ExtentCache defines the struct of the extent cache.
type ExtentCache struct {
sync.RWMutex
inode uint64
gen uint64 // generation number
size uint64 // size of the cache
root *btree.BTree
discard *btree.BTree
verSeq uint64
}
// NewExtentCache returns a new extent cache.
func NewExtentCache(inode uint64) *ExtentCache {
return &ExtentCache{
inode: inode,
root: btree.NewWithSize(8, 4),
discard: btree.NewWithSize(8, 4),
}
}
func (cache *ExtentCache) LogOutPut() {
cache.root.Ascend(func(bi btree.Item) bool {
ek := bi.(*proto.ExtentKey)
log.LogDebugf("ExtentCache update: local ino(%v) ek(%v)", cache.inode, ek)
return true
})
}
func (cache *ExtentCache) RefreshForce(inode uint64, getExtents GetExtentsFunc) error {
gen, size, extents, err := getExtents(inode)
if err != nil {
return err
}
// log.LogDebugf("Local ExtentCache before update: ino(%v) gen(%v) size(%v) extents(%v)", inode, cache.gen, cache.size, cache.List())
cache.update(gen, size, true, extents)
log.LogDebugf("Local ExtentCache after update: ino(%v) gen(%v) size(%v) extents(%v)", inode, cache.gen, cache.size, cache.List())
return nil
}
// Refresh refreshes the extent cache.
func (cache *ExtentCache) Refresh(inode uint64, getExtents GetExtentsFunc) error {
if cache.root.Len() > 0 {
return nil
}
gen, size, extents, err := getExtents(inode)
if err != nil {
return err
}
// log.LogDebugf("Local ExtentCache before update: ino(%v) gen(%v) size(%v) extents(%v)", inode, cache.gen, cache.size, cache.List())
cache.update(gen, size, false, extents)
log.LogDebugf("Local ExtentCache after update: ino(%v) gen(%v) size(%v)", inode, cache.gen, cache.size)
return nil
}
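// update replaces the cached extents with the given list when the generation advances, or unconditionally when forced.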
func (cache *ExtentCache) update(gen, size uint64, force bool, eks []proto.ExtentKey) {
cache.Lock()
defer cache.Unlock()
log.LogDebugf("ExtentCache update: ino(%v) cache.gen(%v) cache.size(%v) gen(%v) size(%v)", cache.inode, cache.gen, cache.size, gen, size)
if !force && cache.gen != 0 && cache.gen >= gen {
log.LogDebugf("ExtentCache update: no need to update, ino(%v) gen(%v) size(%v)", cache.inode, gen, size)
return
}
cache.gen = gen
cache.size = size
cache.root.Clear(false)
for _, ek := range eks {
extent := ek
log.LogDebugf("action[update] update cache replace or insert ek [%v]", ek.String())
cache.root.ReplaceOrInsert(&extent)
}
}
// SplitExtentKey splits the cached extent key that covers ekPivot, merging with sequential neighbors when possible.
func (cache *ExtentCache) SplitExtentKey(inodeID uint64, ekPivot *proto.ExtentKey) (err error) {
cache.Lock()
defer cache.Unlock()
// log.LogDebugf("before cache output")
// cache.LogOutPut()
// When doing the append, we do not care about the data after the file offset.
// Those data will be overwritten by the current extent anyway.
var ekFind *proto.ExtentKey
var ekLeft *proto.ExtentKey
var ekRight *proto.ExtentKey
cache.root.DescendLessOrEqual(ekPivot, func(i btree.Item) bool {
if ekFind == nil {
ekFind = i.(*proto.ExtentKey)
log.LogDebugf("action[ExtentCache.SplitExtentKey] inode %v ek [%v]", inodeID, ekFind)
return true
}
ekLeft = i.(*proto.ExtentKey)
log.LogDebugf("action[ExtentCache.SplitExtentKey] inode %v ekLeft [%v]", inodeID, ekLeft)
return false
})
cache.root.AscendGreaterThan(ekPivot, func(i btree.Item) bool {
ekRight = i.(*proto.ExtentKey)
log.LogDebugf("action[ExtentCache.SplitExtentKey] inode %v ekRight [%v]", inodeID, ekRight)
return false
})
if ekFind == nil {
err = fmt.Errorf("inode %v not found ek fileOff[%v] seq[%v]", inodeID, ekPivot.FileOffset, ekPivot.GetSeq())
return
}
ek := &proto.ExtentKey{}
*ek = *ekFind
cache.root.Delete(ekFind)
if nil != cache.root.Get(ekFind) {
log.LogDebugf("ExtentCache Delete: ino(%v) ek(%v) ", cache.inode, ekFind)
panic(nil)
}
log.LogDebugf("ExtentCache Delete: ino(%v) ek(%v) ", cache.inode, ekFind)
ek.AddModGen()
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v] ekPivot [%v] ekLeft [%v]", inodeID, ek, ekPivot, ekLeft)
// begin
if ek.FileOffset == ekPivot.FileOffset {
ek.Size = ek.Size - ekPivot.Size
ek.FileOffset = ek.FileOffset + uint64(ekPivot.Size)
ek.ExtentOffset = ek.ExtentOffset + uint64(ekPivot.Size)
if ekLeft != nil && ekLeft.IsSequenceWithSameSeq(ekPivot) {
log.LogDebugf("SplitExtentKey.merge.begin. ekLeft %v and %v", ekLeft, ekPivot)
ekLeft.Size += ekPivot.Size
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v], ekPivot[%v] ekLeft[%v]", inodeID, ek, ekPivot, ekLeft)
cache.root.ReplaceOrInsert(ekLeft)
cache.root.ReplaceOrInsert(ek)
cache.gen++
return
}
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v]", inodeID, ek)
} else if ek.FileOffset+uint64(ek.Size) == ekPivot.FileOffset+uint64(ekPivot.Size) { // end
ek.Size = ek.Size - ekPivot.Size
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v]", inodeID, ek)
if ekRight != nil && ekPivot.IsSequenceWithSameSeq(ekRight) {
cache.root.Delete(ekRight)
ekRight.FileOffset = ekPivot.FileOffset
ekRight.ExtentOffset = ekPivot.ExtentOffset
ekRight.Size += ekPivot.Size
cache.root.ReplaceOrInsert(ekRight)
cache.root.ReplaceOrInsert(ek)
log.LogDebugf("SplitExtentKey.merge.end. ek %v and %v", ekPivot, ekRight)
cache.gen++
return
}
} else {
newSize := uint32(ekPivot.FileOffset - ek.FileOffset) // middle
ekEnd := &proto.ExtentKey{
FileOffset: ekPivot.FileOffset + uint64(ekPivot.Size),
PartitionId: ek.PartitionId,
ExtentId: ek.ExtentId,
ExtentOffset: ek.ExtentOffset + uint64(newSize+ekPivot.Size),
Size: ek.Size - newSize - ekPivot.Size,
SnapInfo: &proto.ExtSnapInfo{
VerSeq: ek.GetSeq(),
ModGen: ek.GetModGen(),
},
}
log.LogDebugf("action[SplitExtentKey] inode %v add ekEnd [%v] after split size(%v,%v,%v)", inodeID, ekEnd, newSize, ekPivot.Size, ekEnd.Size)
cache.root.ReplaceOrInsert(ekEnd)
log.LogDebugf("ExtentCache ReplaceOrInsert: ino(%v) ek(%v) ", cache.inode, ekEnd)
ek.Size = newSize
}
cache.root.ReplaceOrInsert(ek)
cache.root.ReplaceOrInsert(ekPivot)
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v], ekPivot[%v]", inodeID, ek, ekPivot)
cache.gen++
// log.LogDebugf("before cache output")
// cache.LogOutPut()
return
}
// Append appends an extent key.
func (cache *ExtentCache) Append(ek *proto.ExtentKey, sync bool) (discardExtents []proto.ExtentKey) {
log.LogDebugf("action[ExtentCache.Append] ek %v", ek)
ekEnd := ek.FileOffset + uint64(ek.Size)
lower := &proto.ExtentKey{FileOffset: ek.FileOffset}
upper := &proto.ExtentKey{FileOffset: ekEnd}
discard := make([]*proto.ExtentKey, 0)
cache.Lock()
defer cache.Unlock()
//cache.root.Descend(func(i btree.Item) bool {
// ek := i.(*proto.ExtentKey)
// // skip if the start offset matches with the given offset
// log.LogDebugf("action[Append.LoopPrint.Enter] inode %v ek [%v]", cache.inode, ek.String())
// return true
//})
// When doing the append, we do not care about the data after the file offset.
// Those data will be overwritten by the current extent anyway.
cache.root.AscendRange(lower, upper, func(i btree.Item) bool {
found := i.(*proto.ExtentKey)
discard = append(discard, found)
return true
})
// After deleting the data between lower and upper, we will do the append
for _, key := range discard {
cache.root.Delete(key)
log.LogDebugf("ExtentCache del: ino(%v) ek(%v) ", cache.inode, key)
if key.PartitionId != 0 && key.ExtentId != 0 && (key.PartitionId != ek.PartitionId || key.ExtentId != ek.ExtentId || ek.ExtentOffset != key.ExtentOffset) {
if sync || (ek.PartitionId == 0 && ek.ExtentId == 0) {
cache.discard.ReplaceOrInsert(key)
// log.LogDebugf("ExtentCache Append add to discard: ino(%v) ek(%v) discard(%v)", cache.inode, ek, key)
}
}
}
cache.root.ReplaceOrInsert(ek)
if sync {
cache.gen++
discardExtents = make([]proto.ExtentKey, 0, cache.discard.Len())
cache.discard.AscendRange(lower, upper, func(i btree.Item) bool {
found := i.(*proto.ExtentKey)
if found.PartitionId != ek.PartitionId || found.ExtentId != ek.ExtentId || found.ExtentOffset != ek.ExtentOffset {
discardExtents = append(discardExtents, *found)
}
return true
})
}
if ekEnd > cache.size {
cache.size = ekEnd
}
log.LogDebugf("ExtentCache Append: ino(%v) sync(%v) ek(%v) local discard(%v) discardExtents(%v), seq(%v)",
cache.inode, sync, ek, discard, discardExtents, ek.GetSeq())
return
}
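// RemoveDiscard deletes the given extent keys from the discard tree.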
func (cache *ExtentCache) RemoveDiscard(discardExtents []proto.ExtentKey) {
cache.Lock()
defer cache.Unlock()
for _, ek := range discardExtents {
cache.discard.Delete(&ek)
// log.LogDebugf("ExtentCache ClearDiscard: ino(%v) discard(%v)", cache.inode, ek)
}
}
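// TruncDiscard drops discard extents at or beyond the given size after a truncate.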
func (cache *ExtentCache) TruncDiscard(size uint64) {
cache.Lock()
defer cache.Unlock()
if size >= cache.size {
return
}
pivot := &proto.ExtentKey{FileOffset: size}
discardExtents := make([]proto.ExtentKey, 0, cache.discard.Len())
cache.discard.AscendGreaterOrEqual(pivot, func(i btree.Item) bool {
found := i.(*proto.ExtentKey)
discardExtents = append(discardExtents, *found)
return true
})
for _, key := range discardExtents {
cache.discard.Delete(&key)
}
log.LogDebugf("truncate ExtentCache discard: ino(%v) size(%v) discard(%v)", cache.inode, size, discardExtents)
}
// Max returns the max extent key in the cache.
func (cache *ExtentCache) Max() *proto.ExtentKey {
cache.RLock()
defer cache.RUnlock()
ek := cache.root.Max().(*proto.ExtentKey)
return ek
}
// Size returns the size of the cache.
func (cache *ExtentCache) Size() (size int, gen uint64) {
cache.RLock()
defer cache.RUnlock()
return int(cache.size), cache.gen
}
// SetSize sets the size of the cache.
func (cache *ExtentCache) SetSize(size uint64, sync bool) {
cache.Lock()
defer cache.Unlock()
cache.size = size
if sync {
cache.gen++
}
}
// List returns a list of the extents in the cache.
func (cache *ExtentCache) List() []*proto.ExtentKey {
cache.RLock()
root := cache.root.Clone()
cache.RUnlock()
extents := make([]*proto.ExtentKey, 0, root.Len())
root.Ascend(func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
extents = append(extents, ek)
return true
})
return extents
}
// Get returns the extent key based on the given offset.
func (cache *ExtentCache) Get(offset uint64) (ret *proto.ExtentKey) {
pivot := &proto.ExtentKey{FileOffset: offset}
cache.RLock()
defer cache.RUnlock()
cache.root.DescendLessOrEqual(pivot, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
// log.LogDebugf("ExtentCache GetConnect: ino(%v) ek(%v) offset(%v)", cache.inode, ek, offset)
if offset >= ek.FileOffset && offset < ek.FileOffset+uint64(ek.Size) {
ret = ek
}
return false
})
return ret
}
// GetEndForAppendWrite returns the extent key whose end offset equals the given offset.
func (cache *ExtentCache) GetEndForAppendWrite(offset uint64, verSeq uint64, needCheck bool) (ret *proto.ExtentKey) {
pivot := &proto.ExtentKey{FileOffset: offset}
cache.RLock()
defer cache.RUnlock()
var lastExistEk *proto.ExtentKey
var lastExistEkTest *proto.ExtentKey
cache.root.DescendLessOrEqual(pivot, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
// skip if the start offset matches with the given offset
if offset == ek.FileOffset {
lastExistEk = ek
return true
}
if offset == ek.FileOffset+uint64(ek.Size) {
if !needCheck || ek.GetSeq() == verSeq {
if int(ek.ExtentOffset)+int(ek.Size) >= util.ExtentSize {
log.LogDebugf("action[ExtentCache.GetEndForAppendWrite] inode %v req offset %v verseq %v not found, exist ek [%v]",
cache.inode, offset, verSeq, ek.String())
ret = nil
return false
}
// There should not be a file-sequential neighbor extent right after this one.
if lastExistEk != nil && ek.IsFileInSequence(lastExistEk) {
log.LogErrorf("action[ExtentCache.GetEndForAppendWrite] ek %v is InSequence exist sequence extent %v", ek, lastExistEk)
ret = nil
return false
}
log.LogDebugf("action[ExtentCache.GetEndForAppendWrite] inode %v offset %v verseq %v found,ek [%v] lastExistEk[%v], lastExistEkTest[%v]",
cache.inode, offset, verSeq, ek.String(), lastExistEk, lastExistEkTest)
ret = ek
} else {
log.LogDebugf("action[ExtentCache.GetEndForAppendWrite] inode %v req offset %v verseq %v not found, exist ek [%v]", cache.inode, offset, verSeq, ek.String())
}
return false
}
lastExistEkTest = ek
return true
})
return ret
}
// PrepareReadRequests classifies the incoming request.
func (cache *ExtentCache) PrepareReadRequests(offset, size int, data []byte) []*ExtentRequest {
requests := make([]*ExtentRequest, 0)
pivot := &proto.ExtentKey{FileOffset: uint64(offset)}
upper := &proto.ExtentKey{FileOffset: uint64(offset + size)}
start := offset
end := offset + size
cache.RLock()
defer cache.RUnlock()
lower := &proto.ExtentKey{}
cache.root.DescendLessOrEqual(pivot, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
lower.FileOffset = ek.FileOffset
return false
})
cache.root.AscendRange(lower, upper, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
ekStart := int(ek.FileOffset)
ekEnd := int(ek.FileOffset) + int(ek.Size)
log.LogDebugf("PrepareReadRequests: req[ino(%v) start(%v) end(%v)] ek[extentID(%v),FileOffset(Start(%v) End(%v))]",
cache.inode, start, end, ek.ExtentId, ekStart, ekEnd)
if start < ekStart {
if end <= ekStart {
return false
} else if end < ekEnd {
// add hole (start, ekStart)
req := NewExtentRequest(start, ekStart-start, data[start-offset:ekStart-offset], nil)
requests = append(requests, req)
// add non-hole (ekStart, end)
req = NewExtentRequest(ekStart, end-ekStart, data[ekStart-offset:end-offset], ek)
requests = append(requests, req)
start = end
return false
} else {
// add hole (start, ekStart)
req := NewExtentRequest(start, ekStart-start, data[start-offset:ekStart-offset], nil)
requests = append(requests, req)
// add non-hole (ekStart, ekEnd)
req = NewExtentRequest(ekStart, ekEnd-ekStart, data[ekStart-offset:ekEnd-offset], ek)
requests = append(requests, req)
start = ekEnd
return true
}
} else if start < ekEnd {
if end <= ekEnd {
// add non-hole (start, end)
req := NewExtentRequest(start, end-start, data[start-offset:end-offset], ek)
requests = append(requests, req)
start = end
return false
} else {
// add non-hole (start, ekEnd), start = ekEnd
req := NewExtentRequest(start, ekEnd-start, data[start-offset:ekEnd-offset], ek)
requests = append(requests, req)
start = ekEnd
return true
}
} else {
return true
}
})
log.LogDebugf("PrepareReadRequests: ino(%v) start(%v) end(%v)", cache.inode, start, end)
if start < end {
// add hole (start, end)
req := NewExtentRequest(start, end-start, data[start-offset:end-offset], nil)
requests = append(requests, req)
}
return requests
}
// PrepareWriteRequests classifies the incoming write request against the cached extents,
// splitting it into hole and non-hole extent requests.
func (cache *ExtentCache) PrepareWriteRequests(offset, size int, data []byte) []*ExtentRequest {
requests := make([]*ExtentRequest, 0)
pivot := &proto.ExtentKey{FileOffset: uint64(offset)}
upper := &proto.ExtentKey{FileOffset: uint64(offset + size)}
start := offset
end := offset + size
cache.RLock()
defer cache.RUnlock()
lower := &proto.ExtentKey{}
cache.root.DescendLessOrEqual(pivot, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
lower.FileOffset = ek.FileOffset
log.LogDebugf("action[ExtentCache.PrepareWriteRequests] ek [%v], pivot[%v]", ek, pivot)
return false
})
cache.root.AscendRange(lower, upper, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
ekStart := int(ek.FileOffset)
ekEnd := int(ek.FileOffset) + int(ek.Size)
log.LogDebugf("action[ExtentCache.PrepareWriteRequests]: ino(%v) start(%v) end(%v) ekStart(%v) ekEnd(%v)", cache.inode, start, end, ekStart, ekEnd)
if start <= ekStart {
if end <= ekStart {
return false
} else if end < ekEnd {
var req *ExtentRequest
if start < ekStart {
// add hole (start, ekStart)
req = NewExtentRequest(start, ekStart-start, data[start-offset:ekStart-offset], nil)
requests = append(requests, req)
}
// add non-hole (ekStart, end)
req = NewExtentRequest(ekStart, end-ekStart, data[ekStart-offset:end-offset], ek)
requests = append(requests, req)
start = end
return false
} else {
return true
}
} else if start < ekEnd {
if end <= ekEnd {
// add non-hole (start, end)
req := NewExtentRequest(start, end-start, data[start-offset:end-offset], ek)
requests = append(requests, req)
start = end
return false
} else {
// add non-hole (start, ekEnd), start = ekEnd
req := NewExtentRequest(start, ekEnd-start, data[start-offset:ekEnd-offset], ek)
requests = append(requests, req)
start = ekEnd
return true
}
} else {
return true
}
})
log.LogDebugf("PrepareWriteRequests: ino(%v) start(%v) end(%v)", cache.inode, start, end)
if start < end {
// add hole (start, end)
req := NewExtentRequest(start, end-start, data[start-offset:end-offset], nil)
requests = append(requests, req)
}
return requests
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"container/list"
"context"
"fmt"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/manager"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
"golang.org/x/time/rate"
)
type (
SplitExtentKeyFunc func(parentInode, inode uint64, key proto.ExtentKey) error
AppendExtentKeyFunc func(parentInode, inode uint64, key proto.ExtentKey, discard []proto.ExtentKey) (int, error)
GetExtentsFunc func(inode uint64) (uint64, uint64, []proto.ExtentKey, error)
TruncateFunc func(inode, size uint64, fullPath string) error
EvictIcacheFunc func(inode uint64)
LoadBcacheFunc func(key string, buf []byte, offset uint64, size uint32) (int, error)
CacheBcacheFunc func(key string, buf []byte) error
EvictBacheFunc func(key string) error
)
const (
MaxMountRetryLimit = 6
MountRetryInterval = time.Second * 5
defaultReadLimitRate = rate.Inf
defaultReadLimitBurst = 128
defaultWriteLimitRate = rate.Inf
defaultWriteLimitBurst = 128
defaultStreamerLimit = 100000
defMaxStreamerLimit = 10000000
kHighWatermarkPct = 1.01
slowStreamerEvictNum = 10
fastStreamerEvictNum = 10000
)
var (
// global object pools for memory optimization
openRequestPool *sync.Pool
writeRequestPool *sync.Pool
flushRequestPool *sync.Pool
releaseRequestPool *sync.Pool
truncRequestPool *sync.Pool
evictRequestPool *sync.Pool
)
func init() {
// init object pools
openRequestPool = &sync.Pool{New: func() interface{} {
return &OpenRequest{}
}}
writeRequestPool = &sync.Pool{New: func() interface{} {
return &WriteRequest{}
}}
flushRequestPool = &sync.Pool{New: func() interface{} {
return &FlushRequest{}
}}
releaseRequestPool = &sync.Pool{New: func() interface{} {
return &ReleaseRequest{}
}}
truncRequestPool = &sync.Pool{New: func() interface{} {
return &TruncRequest{}
}}
evictRequestPool = &sync.Pool{New: func() interface{} {
return &EvictRequest{}
}}
}
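// ExtentConfig defines the options used to create an ExtentClient.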
type ExtentConfig struct {
Volume string
VolumeType int
Masters []string
FollowerRead bool
NearRead bool
Preload bool
ReadRate int64
WriteRate int64
BcacheEnable bool
BcacheDir string
MaxStreamerLimit int64
VerReadSeq uint64
OnAppendExtentKey AppendExtentKeyFunc
OnSplitExtentKey SplitExtentKeyFunc
OnGetExtents GetExtentsFunc
OnTruncate TruncateFunc
OnEvictIcache EvictIcacheFunc
OnLoadBcache LoadBcacheFunc
OnCacheBcache CacheBcacheFunc
OnEvictBcache EvictBacheFunc
DisableMetaCache bool
MinWriteAbleDataPartitionCnt int
}
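// MultiVerMgr tracks the volume's version list and the sequence numbers used for snapshot reads and writes.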
type MultiVerMgr struct {
verReadSeq uint64 // verSeq in config used as snapshot read
latestVerSeq uint64 // newest verSeq from master for datanode write to check
verList *proto.VolVersionInfoList
sync.RWMutex
}
// ExtentClient defines the struct of the extent client.
type ExtentClient struct {
streamers map[uint64]*Streamer
streamerList *list.List
streamerLock sync.Mutex
maxStreamerLimit int
readLimiter *rate.Limiter
writeLimiter *rate.Limiter
disableMetaCache bool
volumeType int
volumeName string
bcacheEnable bool
bcacheDir string
BcacheHealth bool
preload bool
LimitManager *manager.LimitManager
dataWrapper *wrapper.Wrapper
appendExtentKey AppendExtentKeyFunc
splitExtentKey SplitExtentKeyFunc
getExtents GetExtentsFunc
truncate TruncateFunc
evictIcache EvictIcacheFunc // May be null, must check before using
loadBcache LoadBcacheFunc
cacheBcache CacheBcacheFunc
evictBcache EvictBacheFunc
inflightL1cache sync.Map
inflightL1BigBlock int32
multiVerMgr *MultiVerMgr
}
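// UidIsLimited reports whether the given uid is currently rate-limited according to the data wrapper.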
func (client *ExtentClient) UidIsLimited(uid uint32) bool {
client.dataWrapper.UidLock.RLock()
defer client.dataWrapper.UidLock.RUnlock()
if uInfo, ok := client.dataWrapper.Uids[uid]; ok {
if uInfo.Limited {
log.LogDebugf("uid %v is limited", uid)
return true
}
}
log.LogDebugf("uid %v is not limited", uid)
return false
}
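// evictStreamer removes the streamer at the back of the eviction list; streamers that are still
// open are pushed back to the front instead of being dropped. It returns false only when the
// list is empty. The caller must hold streamerLock.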
func (client *ExtentClient) evictStreamer() bool {
// remove from list
item := client.streamerList.Back()
if item == nil {
return false
}
client.streamerList.Remove(item)
ino := item.Value.(uint64)
s, ok := client.streamers[ino]
if !ok {
return true
}
if s.isOpen {
client.streamerList.PushFront(ino)
return true
}
delete(s.client.streamers, s.inode)
return true
}
func (client *ExtentClient) batchEvictStramer(batchCnt int) {
client.streamerLock.Lock()
defer client.streamerLock.Unlock()
for cnt := 0; cnt < batchCnt; cnt++ {
ok := client.evictStreamer()
if !ok {
break
}
}
}
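// backgroundEvictStream periodically evicts idle streamers, in large batches above the high
// watermark and in small batches otherwise, until the count drops below maxStreamerLimit.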
func (client *ExtentClient) backgroundEvictStream() {
t := time.NewTicker(2 * time.Second)
for range t.C {
start := time.Now()
streamerSize := client.streamerList.Len()
highWatermark := int(float32(client.maxStreamerLimit) * kHighWatermarkPct)
for streamerSize > client.maxStreamerLimit {
// fast evict
if streamerSize > highWatermark {
client.batchEvictStramer(fastStreamerEvictNum)
} else {
client.batchEvictStramer(slowStreamerEvictNum)
}
streamerSize = client.streamerList.Len()
log.LogInfof("batch evict cnt(%d), cost(%d), now(%d)", 1, time.Since(start).Microseconds(), streamerSize)
}
log.LogInfof("streamer total cnt(%d), cost(%d) ns", streamerSize, time.Since(start).Nanoseconds())
}
}
// NewExtentClient returns a new extent client.
func NewExtentClient(config *ExtentConfig) (client *ExtentClient, err error) {
client = new(ExtentClient)
client.LimitManager = manager.NewLimitManager(client)
client.LimitManager.WrapperUpdate = client.UploadFlowInfo
limit := 0
retry:
client.dataWrapper, err = wrapper.NewDataPartitionWrapper(client, config.Volume, config.Masters, config.Preload, config.MinWriteAbleDataPartitionCnt, config.VerReadSeq)
if err != nil {
log.LogErrorf("NewExtentClient: new data partition wrapper failed: volume(%v) mayRetry(%v) err(%v)",
config.Volume, limit, err)
if strings.Contains(err.Error(), proto.ErrVolNotExists.Error()) {
return nil, proto.ErrVolNotExists
}
if limit >= MaxMountRetryLimit {
return nil, errors.Trace(err, "Init data wrapper failed!")
} else {
limit++
time.Sleep(MountRetryInterval * time.Duration(limit))
goto retry
}
}
client.streamers = make(map[uint64]*Streamer)
client.multiVerMgr = &MultiVerMgr{verList: &proto.VolVersionInfoList{}}
client.appendExtentKey = config.OnAppendExtentKey
client.splitExtentKey = config.OnSplitExtentKey
client.getExtents = config.OnGetExtents
client.truncate = config.OnTruncate
client.evictIcache = config.OnEvictIcache
client.dataWrapper.InitFollowerRead(config.FollowerRead)
client.dataWrapper.SetNearRead(config.NearRead)
client.loadBcache = config.OnLoadBcache
client.cacheBcache = config.OnCacheBcache
client.evictBcache = config.OnEvictBcache
client.volumeType = config.VolumeType
client.volumeName = config.Volume
client.bcacheEnable = config.BcacheEnable
client.bcacheDir = config.BcacheDir
client.multiVerMgr.verReadSeq = client.dataWrapper.GetReadVerSeq()
client.BcacheHealth = true
client.preload = config.Preload
client.disableMetaCache = config.DisableMetaCache
var readLimit, writeLimit rate.Limit
if config.ReadRate <= 0 {
readLimit = defaultReadLimitRate
} else {
readLimit = rate.Limit(config.ReadRate)
}
if config.WriteRate <= 0 {
writeLimit = defaultWriteLimitRate
} else {
writeLimit = rate.Limit(config.WriteRate)
}
client.readLimiter = rate.NewLimiter(readLimit, defaultReadLimitBurst)
client.writeLimiter = rate.NewLimiter(writeLimit, defaultWriteLimitBurst)
if config.MaxStreamerLimit <= 0 {
client.disableMetaCache = true
return
}
if config.MaxStreamerLimit <= defaultStreamerLimit {
client.maxStreamerLimit = defaultStreamerLimit
} else if config.MaxStreamerLimit > defMaxStreamerLimit {
client.maxStreamerLimit = defMaxStreamerLimit
} else {
client.maxStreamerLimit = int(config.MaxStreamerLimit)
}
client.maxStreamerLimit += fastStreamerEvictNum
log.LogInfof("max streamer limit %d", client.maxStreamerLimit)
client.streamerList = list.New()
go client.backgroundEvictStream()
return
}
func (client *ExtentClient) GetEnablePosixAcl() bool {
return client.dataWrapper.EnablePosixAcl
}
func (client *ExtentClient) GetFlowInfo() (*proto.ClientReportLimitInfo, bool) {
log.LogInfof("action[ExtentClient.GetFlowInfo]")
return client.LimitManager.GetFlowInfo()
}
func (client *ExtentClient) UpdateFlowInfo(limit *proto.LimitRsp2Client) {
log.LogInfof("action[UpdateFlowInfo.UpdateFlowInfo]")
client.LimitManager.SetClientLimit(limit)
return
}
func (client *ExtentClient) SetClientID(id uint64) (err error) {
client.LimitManager.ID = id
return
}
func (client *ExtentClient) GetVolumeName() string {
return client.volumeName
}
func (client *ExtentClient) GetLatestVer() uint64 {
return atomic.LoadUint64(&client.multiVerMgr.latestVerSeq)
}
func (client *ExtentClient) GetReadVer() uint64 {
return atomic.LoadUint64(&client.multiVerMgr.verReadSeq)
}
func (client *ExtentClient) GetVerMgr() *proto.VolVersionInfoList {
return client.multiVerMgr.verList
}
func (client *ExtentClient) UpdateLatestVer(verList *proto.VolVersionInfoList) (err error) {
verSeq := verList.GetLastVer()
log.LogDebugf("action[UpdateLatestVer] verSeq %v verList[%v] mgr seq %v", verSeq, verList, client.multiVerMgr.latestVerSeq)
if verSeq == 0 || verSeq <= atomic.LoadUint64(&client.multiVerMgr.latestVerSeq) {
return
}
client.multiVerMgr.Lock()
defer client.multiVerMgr.Unlock()
if verSeq <= atomic.LoadUint64(&client.multiVerMgr.latestVerSeq) {
return
}
log.LogDebugf("action[UpdateLatestVer] update verSeq [%v] to [%v]", client.multiVerMgr.latestVerSeq, verSeq)
atomic.StoreUint64(&client.multiVerMgr.latestVerSeq, verSeq)
client.multiVerMgr.verList = verList
client.streamerLock.Lock()
defer client.streamerLock.Unlock()
for _, streamer := range client.streamers {
if streamer.verSeq != verSeq {
log.LogDebugf("action[ExtentClient.UpdateLatestVer] stream inode %v ver %v try update to %v", streamer.inode, streamer.verSeq, verSeq)
oldVer := streamer.verSeq
streamer.verSeq = verSeq
streamer.extents.verSeq = verSeq
if err = streamer.GetExtentsForce(); err != nil {
log.LogErrorf("action[UpdateLatestVer] inode %v streamer %v", streamer.inode, streamer.verSeq)
streamer.verSeq = oldVer
streamer.extents.verSeq = oldVer
return err
}
atomic.StoreInt32(&streamer.needUpdateVer, 1)
log.LogDebugf("action[ExtentClient.UpdateLatestVer] finhsed stream inode %v ver update to %v", streamer.inode, verSeq)
}
}
return nil
}
// Open request shall grab the lock until request is sent to the request channel
func (client *ExtentClient) OpenStream(inode uint64) error {
client.streamerLock.Lock()
s, ok := client.streamers[inode]
if !ok {
s = NewStreamer(client, inode)
client.streamers[inode] = s
}
return s.IssueOpenRequest()
}
// Open request shall grab the lock until request is sent to the request channel
func (client *ExtentClient) OpenStreamWithCache(inode uint64, needBCache bool) error {
client.streamerLock.Lock()
s, ok := client.streamers[inode]
if !ok {
s = NewStreamer(client, inode)
client.streamers[inode] = s
if !client.disableMetaCache && needBCache {
client.streamerList.PushFront(inode)
}
}
s.needBCache = needBCache
if !s.isOpen && !client.disableMetaCache {
s.isOpen = true
log.LogDebugf("open stream again, ino(%v)", s.inode)
s.request = make(chan interface{}, 64)
s.pendingCache = make(chan bcacheKey, 1)
go s.server()
go s.asyncBlockCache()
}
return s.IssueOpenRequest()
}
// Release request shall grab the lock until request is sent to the request channel
func (client *ExtentClient) CloseStream(inode uint64) error {
client.streamerLock.Lock()
s, ok := client.streamers[inode]
if !ok {
client.streamerLock.Unlock()
return nil
}
return s.IssueReleaseRequest()
}
// Evict request shall grab the lock until request is sent to the request channel
func (client *ExtentClient) EvictStream(inode uint64) error {
client.streamerLock.Lock()
s, ok := client.streamers[inode]
if !ok {
client.streamerLock.Unlock()
return nil
}
if s.isOpen {
s.isOpen = false
err := s.IssueEvictRequest()
if err != nil {
return err
}
s.done <- struct{}{}
} else {
delete(s.client.streamers, s.inode)
s.client.streamerLock.Unlock()
}
return nil
}
// RefreshExtentsCache refreshes the extent cache.
func (client *ExtentClient) RefreshExtentsCache(inode uint64) error {
s := client.GetStreamer(inode)
if s == nil {
return nil
}
return s.GetExtents()
}
func (client *ExtentClient) ForceRefreshExtentsCache(inode uint64) error {
s := client.GetStreamer(inode)
if s == nil {
return nil
}
return s.GetExtentsForce()
}
// GetExtentCacheGen return extent generation
func (client *ExtentClient) GetExtentCacheGen(inode uint64) uint64 {
s := client.GetStreamer(inode)
if s == nil {
return 0
}
return s.extents.gen
}
func (client *ExtentClient) GetExtents(inode uint64) []*proto.ExtentKey {
s := client.GetStreamer(inode)
if s == nil {
return nil
}
return s.extents.List()
}
// FileSize returns the file size.
func (client *ExtentClient) FileSize(inode uint64) (size int, gen uint64, valid bool) {
s := client.GetStreamer(inode)
if s == nil {
return
}
valid = true
size, gen = s.extents.Size()
return
}
// SetFileSize set the file size.
func (client *ExtentClient) SetFileSize(inode uint64, size int) {
s := client.GetStreamer(inode)
if s != nil {
log.LogDebugf("SetFileSize: ino(%v) size(%v)", inode, size)
s.extents.SetSize(uint64(size), true)
}
}
// Write writes the data.
func (client *ExtentClient) Write(inode uint64, offset int, data []byte, flags int, checkFunc func() error) (write int, err error) {
prefix := fmt.Sprintf("Write{ino(%v)offset(%v)size(%v)}", inode, offset, len(data))
s := client.GetStreamer(inode)
if s == nil {
log.LogErrorf("Prefix(%v): stream is not opened yet", prefix)
return 0, syscall.EBADF
}
s.once.Do(func() {
// TODO unhandled error
s.GetExtents()
})
write, err = s.IssueWriteRequest(offset, data, flags, checkFunc)
if err != nil {
log.LogError(errors.Stack(err))
exporter.Warning(err.Error())
}
return
}
func (client *ExtentClient) Truncate(mw *meta.MetaWrapper, parentIno uint64, inode uint64, size int, fullPath string) error {
prefix := fmt.Sprintf("Truncate{ino(%v)size(%v)}", inode, size)
s := client.GetStreamer(inode)
if s == nil {
log.LogErrorf("Prefix(%v): stream is not opened yet", prefix)
return syscall.EBADF
}
var info *proto.InodeInfo
var err error
var oldSize uint64
if mw.EnableSummary {
info, err = mw.InodeGet_ll(inode)
if err == nil && info != nil {
oldSize = info.Size
}
}
err = s.IssueTruncRequest(size, fullPath)
if err != nil {
err = errors.Trace(err, prefix)
log.LogError(errors.Stack(err))
}
if mw.EnableSummary {
go mw.UpdateSummary_ll(parentIno, 0, 0, int64(size)-int64(oldSize))
}
return err
}
func (client *ExtentClient) Flush(inode uint64) error {
s := client.GetStreamer(inode)
if s == nil {
log.LogErrorf("Flush: stream is not opened yet, ino(%v)", inode)
return syscall.EBADF
}
return s.IssueFlushRequest()
}
func (client *ExtentClient) Read(inode uint64, data []byte, offset int, size int) (read int, err error) {
// log.LogErrorf("======> ExtentClient Read Enter, inode(%v), len(data)=(%v), offset(%v), size(%v).", inode, len(data), offset, size)
// t1 := time.Now()
if size == 0 {
return
}
s := client.GetStreamer(inode)
if s == nil {
log.LogErrorf("Read: stream is not opened yet, ino(%v) offset(%v) size(%v)", inode, offset, size)
return 0, syscall.EBADF
}
s.once.Do(func() {
s.GetExtents()
})
err = s.IssueFlushRequest()
if err != nil {
return
}
read, err = s.read(data, offset, size)
// log.LogErrorf("======> ExtentClient Read Exit, inode(%v), time[%v us].", inode, time.Since(t1).Microseconds())
return
}
func (client *ExtentClient) ReadExtent(inode uint64, ek *proto.ExtentKey, data []byte, offset int, size int) (read int, err error, isStream bool) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("read-extent", err, bgTime, 1)
}()
var reader *ExtentReader
var req *ExtentRequest
if size == 0 {
return
}
s := client.GetStreamer(inode)
if s == nil {
err = fmt.Errorf("Read: stream is not opened yet, ino(%v) ek(%v)", inode, ek)
return
}
err = s.IssueFlushRequest()
if err != nil {
return
}
reader, err = s.GetExtentReader(ek)
if err != nil {
return
}
needCache := false
cacheKey := util.GenerateKey(s.client.volumeName, s.inode, ek.FileOffset)
if _, ok := client.inflightL1cache.Load(cacheKey); !ok && client.shouldBcache() {
client.inflightL1cache.Store(cacheKey, true)
needCache = true
}
defer client.inflightL1cache.Delete(cacheKey)
// do cache.
if needCache {
// read full extent
buf := make([]byte, ek.Size)
req = NewExtentRequest(int(ek.FileOffset), int(ek.Size), buf, ek)
read, err = reader.Read(req)
if err != nil {
return
}
read = copy(data, req.Data[offset:offset+size])
if client.cacheBcache != nil {
buf := make([]byte, len(req.Data))
copy(buf, req.Data)
go func() {
log.LogDebugf("ReadExtent L2->L1 Enter cacheKey(%v),client.shouldBcache(%v),needCache(%v)", cacheKey, client.shouldBcache(), needCache)
if err := client.cacheBcache(cacheKey, buf); err != nil {
client.BcacheHealth = false
log.LogDebugf("ReadExtent L2->L1 failed, err(%v), set BcacheHealth to false.", err)
}
log.LogDebugf("ReadExtent L2->L1 Exit cacheKey(%v),client.BcacheHealth(%v),needCache(%v)", cacheKey, client.BcacheHealth, needCache)
}()
}
return
} else {
// read data by offset:size
req = NewExtentRequest(int(ek.FileOffset)+offset, size, data, ek)
ctx := context.Background()
s.client.readLimiter.Wait(ctx)
s.client.LimitManager.ReadAlloc(ctx, size)
isStream = true
read, err = reader.Read(req)
if err != nil {
return
}
read = copy(data, req.Data)
return
}
}
// GetStreamer returns the streamer.
func (client *ExtentClient) GetStreamer(inode uint64) *Streamer {
client.streamerLock.Lock()
defer client.streamerLock.Unlock()
s, ok := client.streamers[inode]
if !ok {
return nil
}
if !s.isOpen {
s.isOpen = true
s.request = make(chan interface{}, 64)
s.pendingCache = make(chan bcacheKey, 1)
go s.server()
go s.asyncBlockCache()
}
return s
}
func (client *ExtentClient) GetRate() string {
return fmt.Sprintf("read: %v\nwrite: %v\n", getRate(client.readLimiter), getRate(client.writeLimiter))
}
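// shouldBcache reports whether data may be written to the local block cache (enabled and healthy).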
func (client *ExtentClient) shouldBcache() bool {
return client.bcacheEnable && client.BcacheHealth
}
func getRate(lim *rate.Limiter) string {
val := int(lim.Limit())
if val > 0 {
return fmt.Sprintf("%v", val)
}
return "unlimited"
}
func (client *ExtentClient) SetReadRate(val int) string {
return setRate(client.readLimiter, val)
}
func (client *ExtentClient) SetWriteRate(val int) string {
return setRate(client.writeLimiter, val)
}
func setRate(lim *rate.Limiter, val int) string {
if val > 0 {
lim.SetLimit(rate.Limit(val))
return fmt.Sprintf("%v", val)
}
lim.SetLimit(rate.Inf)
return "unlimited"
}
func (client *ExtentClient) Close() error {
// release streamers
var inodes []uint64
client.streamerLock.Lock()
inodes = make([]uint64, 0, len(client.streamers))
for inode := range client.streamers {
inodes = append(inodes, inode)
}
client.streamerLock.Unlock()
for _, inode := range inodes {
_ = client.EvictStream(inode)
}
client.dataWrapper.Stop()
return nil
}
func (client *ExtentClient) AllocatePreLoadDataPartition(volName string, count int, capacity, ttl uint64, zones string) (err error) {
return client.dataWrapper.AllocatePreLoadDataPartition(volName, count, capacity, ttl, zones)
}
func (client *ExtentClient) CheckDataPartitionExsit(partitionID uint64) error {
_, err := client.dataWrapper.GetDataPartition(partitionID)
return err
}
func (client *ExtentClient) GetDataPartitionForWrite() error {
exclude := make(map[string]struct{})
_, err := client.dataWrapper.GetDataPartitionForWrite(exclude)
return err
}
func (client *ExtentClient) UpdateDataPartitionForColdVolume() error {
return client.dataWrapper.UpdateDataPartition()
}
func (client *ExtentClient) IsPreloadMode() bool {
return client.preload
}
func (client *ExtentClient) UploadFlowInfo(clientInfo wrapper.SimpleClientInfo) error {
return client.dataWrapper.UploadFlowInfo(clientInfo, false)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"container/list"
"sync"
)
// DirtyExtentList defines the struct of the dirty extent list.
type DirtyExtentList struct {
sync.RWMutex
list *list.List
}
// NewDirtyExtentList returns a new DirtyExtentList instance.
func NewDirtyExtentList() *DirtyExtentList {
return &DirtyExtentList{
list: list.New(),
}
}
// Put puts a new extent handler into the dirty extent list.
func (dl *DirtyExtentList) Put(eh *ExtentHandler) {
dl.Lock()
defer dl.Unlock()
dl.list.PushBack(eh)
}
// Get gets the next element in the dirty extent list.
func (dl *DirtyExtentList) Get() *list.Element {
dl.RLock()
defer dl.RUnlock()
return dl.list.Front()
}
// Remove removes the element from the dirty extent list.
func (dl *DirtyExtentList) Remove(e *list.Element) {
dl.Lock()
defer dl.Unlock()
dl.list.Remove(e)
}
// Len returns the size of the dirty extent list.
func (dl *DirtyExtentList) Len() int {
dl.RLock()
defer dl.RUnlock()
return dl.list.Len()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"fmt"
"net"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// State machines
const (
ExtentStatusOpen int32 = iota
ExtentStatusClosed
ExtentStatusRecovery
ExtentStatusError
)
var gExtentHandlerID = uint64(0)
// GetExtentHandlerID returns the extent handler ID.
func GetExtentHandlerID() uint64 {
return atomic.AddUint64(&gExtentHandlerID, 1)
}
// ExtentHandler defines the struct of the extent handler.
type ExtentHandler struct {
// Fields that are set at creation time and will not be changed afterwards.
stream *Streamer
id uint64 // extent handler id
inode uint64
fileOffset int
storeMode int
// Either open/closed/recovery/error.
// Can transit from one state to the next adjacent state ONLY.
status int32
// Created, filled and sent in Write.
packet *Packet
// Updated in *write* method ONLY.
size int
// Pending packets in sender and receiver.
// Does not involve the packet in open handler.
inflight int32
// For ExtentStore, the extent ID is assigned in the sender.
// For TinyStore, the extent ID is assigned in the receiver.
// Will not be changed once assigned.
extID int
// Allocated in the sender, and released in the receiver.
// Will not be changed.
conn *net.TCPConn
dp *wrapper.DataPartition
// Issue a signal to this channel when *inflight* hits zero.
// To wake up *waitForFlush*.
empty chan struct{}
// Created and updated in *receiver* ONLY.
// Not protected by a lock, therefore it can be used ONLY when there are no
// pending or new packets.
key *proto.ExtentKey
dirty bool // indicate if open handler is dirty.
// Created in receiver ONLY in recovery status.
// Will not be changed once assigned.
recoverHandler *ExtentHandler
// The stream writer gets the write requests and constructs the packets
// to be sent to the request channel.
// The *sender* gets the packets from the *request* channel, sends them to the
// corresponding data node, and then passes them on to the *reply* channel.
// The *receiver* gets the packets from the *reply* channel, waits for the
// reply from the data node, and then handles it (see the usage sketch after NewExtentHandler).
request chan *Packet
reply chan *Packet
// Signaled in stream writer ONLY to exit *receiver*.
doneReceiver chan struct{}
// Signaled in receiver ONLY to exit *sender*.
doneSender chan struct{}
// A version update requires allocating a new extent.
verUpdate chan uint64
appendLK sync.Mutex
lastKey proto.ExtentKey
}
// NewExtentHandler returns a new extent handler.
func NewExtentHandler(stream *Streamer, offset int, storeMode int, size int) *ExtentHandler {
// log.LogDebugf("NewExtentHandler stack(%v)", string(debug.Stack()))
eh := &ExtentHandler{
stream: stream,
id: GetExtentHandlerID(),
inode: stream.inode,
fileOffset: offset,
size: size,
storeMode: storeMode,
empty: make(chan struct{}, 1024),
request: make(chan *Packet, 1024),
reply: make(chan *Packet, 1024),
doneSender: make(chan struct{}),
doneReceiver: make(chan struct{}),
}
go eh.receiver()
go eh.sender()
return eh
}
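// exampleHandlerWriteFlush is a hedged usage sketch (illustrative only, not invoked by
// the SDK): it drives the sender/receiver pipeline described in the ExtentHandler
// comment above by buffering data through write(), flushing the pending packets, and
// finally tearing the handler down. It assumes offset continues exactly where the
// handler's buffered data ends, since hot volumes reject non-contiguous writes.
func exampleHandlerWriteFlush(eh *ExtentHandler, data []byte, offset int) error {
	if _, err := eh.write(data, offset, len(data), false); err != nil {
		return err
	}
	if err := eh.flush(); err != nil {
		return err
	}
	return eh.cleanup()
}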
// String returns the string format of the extent handler.
func (eh *ExtentHandler) String() string {
return fmt.Sprintf("ExtentHandler{ID(%v)Inode(%v)FileOffset(%v)Size(%v)StoreMode(%v)Status(%v)Dp(%v)Ver(%v)key(%v)lastKey(%v)}",
eh.id, eh.inode, eh.fileOffset, eh.size, eh.storeMode, eh.status, eh.dp, eh.stream.verSeq, eh.key, eh.lastKey)
}
func (eh *ExtentHandler) write(data []byte, offset, size int, direct bool) (ek *proto.ExtentKey, err error) {
var total, write int
status := eh.getStatus()
if status >= ExtentStatusClosed {
err = errors.NewErrorf("ExtentHandler Write: Full or Recover eh(%v) key(%v)", eh, eh.key)
return
}
var blksize int
if eh.storeMode == proto.TinyExtentType {
blksize = eh.stream.tinySizeLimit()
} else {
blksize = util.BlockSize
}
// If this write request is not contiguous and cannot be merged
// into the extent handler, just close it and return an error.
// In this case, the caller should try to create a new extent handler.
if proto.IsHot(eh.stream.client.volumeType) {
if eh.fileOffset+eh.size != offset || eh.size+size > util.ExtentSize ||
(eh.storeMode == proto.TinyExtentType && eh.size+size > blksize) {
err = errors.New("ExtentHandler: full or incontinuous")
return
}
}
for total < size {
if eh.packet == nil {
eh.packet = NewWritePacket(eh.inode, offset+total, eh.storeMode)
if direct {
eh.packet.Opcode = proto.OpSyncWrite
}
// log.LogDebugf("ExtentHandler Write: NewPacket, eh(%v) packet(%v)", eh, eh.packet)
}
packsize := int(eh.packet.Size)
write = util.Min(size-total, blksize-packsize)
if write > 0 {
copy(eh.packet.Data[packsize:packsize+write], data[total:total+write])
eh.packet.Size += uint32(write)
total += write
}
if int(eh.packet.Size) >= blksize {
eh.flushPacket()
}
}
eh.size += total
// This is just a local cache used to prepare write requests.
// The partition and extent have not been allocated yet.
ek = &proto.ExtentKey{
FileOffset: uint64(eh.fileOffset),
Size: uint32(eh.size),
}
return ek, nil
}
func (eh *ExtentHandler) sender() {
var err error
for {
select {
case packet := <-eh.request:
log.LogDebugf("ExtentHandler sender begin: eh(%v) packet(%v)", eh, packet)
if eh.getStatus() >= ExtentStatusRecovery {
log.LogWarnf("sender in recovery: eh(%v) packet(%v)", eh, packet)
eh.reply <- packet
continue
}
// Initialize dp, conn, and extID
if eh.dp == nil {
if err = eh.allocateExtent(); err != nil {
eh.setClosed()
eh.setRecovery()
// if dp is not specified and yet we failed, then error out.
// otherwise, just try to recover.
if eh.key == nil {
eh.setError()
log.LogErrorf("sender: eh(%v) err(%v)", eh, err)
} else {
log.LogWarnf("sender: eh(%v) err(%v)", eh, err)
}
eh.reply <- packet
continue
}
}
// For ExtentStore, calculate the extent offset.
// For TinyStore, the extent offset is always 0 in the request packet,
// and the reply packet tells the real extent offset.
extOffset := int(packet.KernelOffset) - eh.fileOffset
if eh.key != nil {
extOffset += int(eh.key.ExtentOffset)
}
// fill the packet according to the extent
packet.PartitionID = eh.dp.PartitionID
packet.ExtentType = uint8(eh.storeMode)
packet.ExtentID = uint64(eh.extID)
packet.ExtentOffset = int64(extOffset)
packet.Arg = ([]byte)(eh.dp.GetAllAddrs())
packet.ArgLen = uint32(len(packet.Arg))
packet.RemainingFollowers = uint8(len(eh.dp.Hosts) - 1)
if len(eh.dp.Hosts) == 1 {
packet.RemainingFollowers = 127
}
packet.StartT = time.Now().UnixNano()
log.LogDebugf("ExtentHandler sender: extent allocated, eh(%v) dp(%v) extID(%v) packet(%v)", eh, eh.dp, eh.extID, packet.GetUniqueLogId())
if err = packet.writeToConn(eh.conn); err != nil {
log.LogWarnf("sender writeTo: failed, eh(%v) err(%v) packet(%v)", eh, err, packet)
eh.setClosed()
eh.setRecovery()
}
eh.reply <- packet
case <-eh.doneSender:
eh.setClosed()
log.LogDebugf("sender: done, eh(%v) size(%v) ek(%v)", eh, eh.size, eh.key)
return
}
}
}
func (eh *ExtentHandler) receiver() {
for {
select {
case packet := <-eh.reply:
eh.processReply(packet)
case <-eh.doneReceiver:
log.LogDebugf("receiver done: eh(%v) size(%v) ek(%v)", eh, eh.size, eh.key)
return
}
}
}
func (eh *ExtentHandler) processReply(packet *Packet) {
defer func() {
if atomic.AddInt32(&eh.inflight, -1) <= 0 {
eh.empty <- struct{}{}
}
}()
status := eh.getStatus()
if status >= ExtentStatusError {
eh.discardPacket(packet)
log.LogErrorf("processReply discard packet: handler is in error status, inflight(%v) eh(%v) packet(%v)", atomic.LoadInt32(&eh.inflight), eh, packet)
return
} else if status >= ExtentStatusRecovery {
if err := eh.recoverPacket(packet); err != nil {
eh.discardPacket(packet)
log.LogErrorf("processReply discard packet: handler is in recovery status, inflight(%v) eh(%v) packet(%v) err(%v)", atomic.LoadInt32(&eh.inflight), eh, packet, err)
}
log.LogDebugf("processReply recover packet: handler is in recovery status, inflight(%v) from eh(%v) to recoverHandler(%v) packet(%v)", atomic.LoadInt32(&eh.inflight), eh, eh.recoverHandler, packet)
return
}
var verUpdate bool
reply := NewReply(packet.ReqID, packet.PartitionID, packet.ExtentID)
err := reply.ReadFromConnWithVer(eh.conn, proto.ReadDeadlineTime)
if err != nil {
eh.processReplyError(packet, err.Error())
return
}
if reply.VerSeq > atomic.LoadUint64(&eh.stream.verSeq) || (eh.key != nil && reply.VerSeq > eh.key.GetSeq()) {
log.LogDebugf("processReply.UpdateLatestVer update verseq according to data rsp from version %v to %v", eh.stream.verSeq, reply.VerSeq)
if err = eh.stream.client.UpdateLatestVer(&proto.VolVersionInfoList{VerList: reply.VerList}); err != nil {
eh.processReplyError(packet, err.Error())
return
}
if err = eh.appendExtentKey(); err != nil {
eh.processReplyError(packet, err.Error())
return
}
eh.key = nil
verUpdate = true
}
if reply.ResultCode != proto.OpOk {
if reply.ResultCode != proto.ErrCodeVersionOpError {
errmsg := fmt.Sprintf("reply NOK: reply(%v)", reply)
log.LogDebugf("processReply packet (%v) errmsg (%v)", packet, errmsg)
eh.processReplyError(packet, errmsg)
return
}
// todo(leonchang) need check safety
log.LogWarnf("processReply: get reply, eh(%v) packet(%v) reply(%v)", eh, packet, reply)
eh.stream.GetExtentsForce()
}
if !packet.isValidWriteReply(reply) {
errmsg := fmt.Sprintf("request and reply does not match: reply(%v)", reply)
eh.processReplyError(packet, errmsg)
return
}
if reply.CRC != packet.CRC {
errmsg := fmt.Sprintf("inconsistent CRC: reqCRC(%v) replyCRC(%v) reply(%v) ", packet.CRC, reply.CRC, reply)
eh.processReplyError(packet, errmsg)
return
}
eh.dp.RecordWrite(packet.StartT)
var extID, extOffset uint64
if eh.storeMode == proto.TinyExtentType {
extID = reply.ExtentID
extOffset = uint64(reply.ExtentOffset)
} else {
extID = packet.ExtentID
extOffset = packet.KernelOffset - uint64(eh.fileOffset)
}
fileOffset := uint64(eh.fileOffset)
if verUpdate {
fileOffset = reply.KernelOffset
}
if eh.key == nil || verUpdate {
eh.key = &proto.ExtentKey{
FileOffset: fileOffset,
PartitionId: packet.PartitionID,
ExtentId: extID,
ExtentOffset: extOffset,
Size: packet.Size,
SnapInfo: &proto.ExtSnapInfo{
VerSeq: reply.VerSeq,
},
}
} else {
eh.key.Size += packet.Size
}
proto.Buffers.Put(packet.Data)
packet.Data = nil
eh.dirty = true
return
}
func (eh *ExtentHandler) processReplyError(packet *Packet, errmsg string) {
eh.setClosed()
eh.setRecovery()
if err := eh.recoverPacket(packet); err != nil {
eh.discardPacket(packet)
log.LogErrorf("processReplyError discard packet: eh(%v) packet(%v) err(%v) errmsg(%v)", eh, packet, err, errmsg)
}
}
func (eh *ExtentHandler) flush() (err error) {
eh.flushPacket()
eh.waitForFlush()
err = eh.appendExtentKey()
if err != nil {
return
}
if eh.storeMode == proto.TinyExtentType {
eh.setClosed()
}
status := eh.getStatus()
if status >= ExtentStatusError {
err = errors.New(fmt.Sprintf("StreamWriter flush: extent handler in error status, eh(%v) size(%v)", eh, eh.size))
}
return
}
func (eh *ExtentHandler) cleanup() (err error) {
eh.doneSender <- struct{}{}
eh.doneReceiver <- struct{}{}
if eh.conn != nil {
conn := eh.conn
eh.conn = nil
// TODO unhandled error
if status := eh.getStatus(); status >= ExtentStatusRecovery {
StreamConnPool.PutConnect(conn, true)
} else {
StreamConnPool.PutConnect(conn, false)
}
}
return
}
// Can ONLY be called when the handler is no longer open.
func (eh *ExtentHandler) appendExtentKey() (err error) {
eh.appendLK.Lock()
defer eh.appendLK.Unlock()
if eh.key != nil {
if eh.dirty {
if proto.IsCold(eh.stream.client.volumeType) && eh.status == ExtentStatusError {
return
}
var (
discard []proto.ExtentKey
status int
)
ekey := *eh.key
doAppend := func() (err error) {
discard = eh.stream.extents.Append(&ekey, true)
status, err = eh.stream.client.appendExtentKey(eh.stream.parentInode, eh.inode, ekey, discard)
if atomic.LoadInt32(&eh.stream.needUpdateVer) > 0 {
if errUpdateExtents := eh.stream.GetExtentsForce(); errUpdateExtents != nil {
log.LogErrorf("action[appendExtentKey] inode %v GetExtents err %v errUpdateExtents %v", eh.stream.inode, err, errUpdateExtents)
return
}
}
if err == nil && len(discard) > 0 {
eh.stream.extents.RemoveDiscard(discard)
}
return
}
if err = doAppend(); err == nil {
eh.dirty = false
eh.lastKey = *eh.key
log.LogDebugf("action[appendExtentKey] status %v, needUpdateVer %v, eh{%v}", status, eh.stream.needUpdateVer, eh)
return
}
// Version numbers are synchronized asynchronously: the client updates its extent cache version first,
// before the key is written to the meta. The client version can therefore lag behind the meta version,
// which leads to partially inconsistent decisions.
// For example, if the version is unchanged on the client, an append write reuses the extent key and only
// changes its length. But if the meta has already moved to a new version, a new extent key has to be
// constructed to retry the operation (see the sketch after this function).
log.LogWarnf("action[appendExtentKey] status %v, handler %v, err %v", status, eh, err)
if status == meta.StatusConflictExtents &&
(atomic.LoadInt32(&eh.stream.needUpdateVer) > 0 || eh.stream.verSeq > 0) &&
eh.lastKey.PartitionId != 0 {
log.LogDebugf("action[appendExtentKey] do append again err %v, key %v", err, ekey)
if eh.lastKey.IsSameExtent(&ekey) &&
eh.lastKey.FileOffset == ekey.FileOffset &&
eh.lastKey.ExtentOffset == ekey.ExtentOffset &&
eh.lastKey.Size < ekey.Size {
ekey.FileOffset += uint64(eh.lastKey.Size)
ekey.ExtentOffset += uint64(eh.lastKey.Size)
ekey.Size -= eh.lastKey.Size
ekey.SetSeq(eh.stream.verSeq)
eh.lastKey = ekey
if err = doAppend(); err != nil {
eh.key = nil
eh.lastKey.PartitionId = 0
} else {
*eh.key = ekey
}
log.LogDebugf("action[appendExtentKey] do append again err %v, key %v", err, ekey)
}
}
} else {
/*
* Update the extent cache using the ek stored in the eh. This is
* indispensable because the ek in the extent cache might be a
* temporary one with dpid 0, especially when the current eh failed
* and a new eh was created to do the recovery.
*/
_ = eh.stream.extents.Append(eh.key, false)
}
}
if err == nil {
eh.dirty = false
} else {
log.LogErrorf("action[appendExtentKey] %v do append again err %v", eh, err)
eh.lastKey.PartitionId = 0
}
return
}
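// exampleAdjustConflictKey is a minimal sketch (illustrative only) of the conflict-retry
// adjustment described in appendExtentKey above: when the meta reports conflicting
// extents, only the tail that was appended after lastKey is retried under the current
// version sequence. It returns the adjusted key and whether such a retry is applicable.
func exampleAdjustConflictKey(lastKey, ekey proto.ExtentKey, verSeq uint64) (proto.ExtentKey, bool) {
	if !(lastKey.IsSameExtent(&ekey) &&
		lastKey.FileOffset == ekey.FileOffset &&
		lastKey.ExtentOffset == ekey.ExtentOffset &&
		lastKey.Size < ekey.Size) {
		return ekey, false
	}
	ekey.FileOffset += uint64(lastKey.Size)
	ekey.ExtentOffset += uint64(lastKey.Size)
	ekey.Size -= lastKey.Size
	ekey.SetSeq(verSeq)
	return ekey, true
}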
// This function is only meaningful when called from the stream writer's flush method,
// because at that point there are no new write requests.
func (eh *ExtentHandler) waitForFlush() {
if atomic.LoadInt32(&eh.inflight) <= 0 {
return
}
// t := time.NewTicker(10 * time.Second)
// defer t.Stop()
for {
select {
case <-eh.empty:
if atomic.LoadInt32(&eh.inflight) <= 0 {
return
}
// case <-t.C:
// if atomic.LoadInt32(&eh.inflight) <= 0 {
// return
// }
}
}
}
func (eh *ExtentHandler) recoverPacket(packet *Packet) error {
packet.errCount++
if packet.errCount >= MaxPacketErrorCount || proto.IsCold(eh.stream.client.volumeType) {
return errors.New(fmt.Sprintf("recoverPacket failed: reach max error limit, eh(%v) packet(%v)", eh, packet))
}
handler := eh.recoverHandler
if handler == nil {
// Always use the normal extent store mode for recovery,
// because tiny extent files are limited and tiny store
// failures might be due to a lack of available tiny extent files.
handler = NewExtentHandler(eh.stream, int(packet.KernelOffset), proto.NormalExtentType, 0)
handler.setClosed()
}
handler.pushToRequest(packet)
if eh.recoverHandler == nil {
eh.recoverHandler = handler
// Note: put it into the dirty list after the packet is sent, so that
// this handler is not skipped during flush.
eh.stream.dirtylist.Put(handler)
}
return nil
}
func (eh *ExtentHandler) discardPacket(packet *Packet) {
proto.Buffers.Put(packet.Data)
packet.Data = nil
eh.setError()
}
func (eh *ExtentHandler) allocateExtent() (err error) {
var (
dp *wrapper.DataPartition
conn *net.TCPConn
extID int
)
log.LogDebugf("ExtentHandler allocateExtent enter: eh(%v)", eh)
exclude := make(map[string]struct{})
for i := 0; i < MaxSelectDataPartitionForWrite; i++ {
if eh.key == nil {
if dp, err = eh.stream.client.dataWrapper.GetDataPartitionForWrite(exclude); err != nil {
log.LogWarnf("allocateExtent: failed to get write data partition, eh(%v) exclude(%v), clear exclude and try again!", eh, exclude)
exclude = make(map[string]struct{})
continue
}
extID = 0
if eh.storeMode == proto.NormalExtentType {
extID, err = eh.createExtent(dp)
}
if err != nil {
log.LogWarnf("allocateExtent: exclude dp[%v] for write caused by create extent failed, eh(%v) err(%v) exclude(%v)",
dp, eh, err, exclude)
eh.stream.client.dataWrapper.RemoveDataPartitionForWrite(dp.PartitionID)
dp.CheckAllHostsIsAvail(exclude)
continue
}
} else {
if dp, err = eh.stream.client.dataWrapper.GetDataPartition(eh.key.PartitionId); err != nil {
log.LogWarnf("allocateExtent: failed to get write data partition, eh(%v)", eh)
break
}
extID = int(eh.key.ExtentId)
}
if conn, err = StreamConnPool.GetConnect(dp.Hosts[0]); err != nil {
log.LogWarnf("allocateExtent: failed to create connection, eh(%v) err(%v) dp(%v) exclude(%v)",
eh, err, dp, exclude)
// If storeMode is TinyExtentType and the connection cannot be created, we also check the host status.
dp.CheckAllHostsIsAvail(exclude)
if eh.key != nil {
break
}
continue
}
// success
eh.dp = dp
eh.conn = conn
eh.extID = extID
// log.LogDebugf("ExtentHandler allocateExtent exit: eh(%v) dp(%v) extID(%v)", eh, dp, extID)
return nil
}
errmsg := "allocateExtent failed: hit max retry limit"
if err != nil {
err = errors.Trace(err, errmsg)
} else {
err = errors.New(errmsg)
}
return err
}
func (eh *ExtentHandler) createConnection(dp *wrapper.DataPartition) (*net.TCPConn, error) {
conn, err := net.DialTimeout("tcp", dp.Hosts[0], time.Second)
if err != nil {
return nil, err
}
connect := conn.(*net.TCPConn)
// TODO unhandled error
connect.SetKeepAlive(true)
connect.SetNoDelay(true)
return connect, nil
}
func (eh *ExtentHandler) createExtent(dp *wrapper.DataPartition) (extID int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("createExtent", err, bgTime, 1)
}()
conn, err := StreamConnPool.GetConnect(dp.Hosts[0])
if err != nil {
return extID, errors.Trace(err, "createExtent: failed to create connection, eh(%v) datapartionHosts(%v)", eh, dp.Hosts[0])
}
defer func() {
if err != nil {
StreamConnPool.PutConnect(conn, true)
} else {
StreamConnPool.PutConnect(conn, false)
}
}()
p := NewCreateExtentPacket(dp, eh.inode)
if err = p.WriteToConn(conn); err != nil {
return extID, errors.Trace(err, "createExtent: failed to WriteToConn, packet(%v) datapartionHosts(%v)", p, dp.Hosts[0])
}
if err = p.ReadFromConnWithVer(conn, proto.ReadDeadlineTime*2); err != nil {
return extID, errors.Trace(err, "createExtent: failed to ReadFromConn, packet(%v) datapartionHosts(%v)", p, dp.Hosts[0])
}
if p.ResultCode != proto.OpOk {
return extID, errors.New(fmt.Sprintf("createExtent: ResultCode NOK, packet(%v) datapartionHosts(%v) ResultCode(%v)", p, dp.Hosts[0], p.GetResultMsg()))
}
extID = int(p.ExtentID)
if extID <= 0 {
return extID, errors.New(fmt.Sprintf("createExtent: illegal extID(%v) from (%v)", extID, dp.Hosts[0]))
}
return extID, nil
}
// Handler lock is held by the caller.
func (eh *ExtentHandler) flushPacket() {
if eh.packet == nil {
return
}
eh.pushToRequest(eh.packet)
eh.packet = nil
}
func (eh *ExtentHandler) pushToRequest(packet *Packet) {
// Increase before sending the packet, because inflight is used
// to determine if the handler has finished.
atomic.AddInt32(&eh.inflight, 1)
eh.request <- packet
}
func (eh *ExtentHandler) getStatus() int32 {
return atomic.LoadInt32(&eh.status)
}
func (eh *ExtentHandler) setClosed() bool {
// log.LogDebugf("action[ExtentHandler.setClosed] stack (%v)", string(debug.Stack()))
return atomic.CompareAndSwapInt32(&eh.status, ExtentStatusOpen, ExtentStatusClosed)
}
func (eh *ExtentHandler) setRecovery() bool {
// log.LogDebugf("action[ExtentHandler.setRecovery] stack (%v)", string(debug.Stack()))
return atomic.CompareAndSwapInt32(&eh.status, ExtentStatusClosed, ExtentStatusRecovery)
}
func (eh *ExtentHandler) setError() bool {
// log.LogDebugf("action[ExtentHandler.setError] stack (%v)", string(debug.Stack()))
if proto.IsHot(eh.stream.client.volumeType) {
atomic.StoreInt32(&eh.stream.status, StreamerError)
}
return atomic.CompareAndSwapInt32(&eh.status, ExtentStatusRecovery, ExtentStatusError)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"fmt"
"hash/crc32"
"net"
"strings"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// ExtentReader defines the struct of the extent reader.
type ExtentReader struct {
inode uint64
key *proto.ExtentKey
dp *wrapper.DataPartition
followerRead bool
retryRead bool
}
// NewExtentReader returns a new extent reader.
func NewExtentReader(inode uint64, key *proto.ExtentKey, dp *wrapper.DataPartition, followerRead bool, retryRead bool) *ExtentReader {
return &ExtentReader{
inode: inode,
key: key,
dp: dp,
followerRead: followerRead,
retryRead: retryRead,
}
}
// String returns the string format of the extent reader.
func (reader *ExtentReader) String() (m string) {
return fmt.Sprintf("inode (%v) extentKey(%v)", reader.inode,
reader.key.Marshal())
}
// Read reads the extent request.
func (reader *ExtentReader) Read(req *ExtentRequest) (readBytes int, err error) {
offset := req.FileOffset - int(reader.key.FileOffset) + int(reader.key.ExtentOffset)
size := req.Size
reqPacket := NewReadPacket(reader.key, offset, size, reader.inode, req.FileOffset, reader.followerRead)
sc := NewStreamConn(reader.dp, reader.followerRead)
log.LogDebugf("ExtentReader Read enter: size(%v) req(%v) reqPacket(%v)", size, req, reqPacket)
err = sc.Send(&reader.retryRead, reqPacket, func(conn *net.TCPConn) (error, bool) {
readBytes = 0
for readBytes < size {
replyPacket := NewReply(reqPacket.ReqID, reader.dp.PartitionID, reqPacket.ExtentID)
bufSize := util.Min(util.ReadBlockSize, size-readBytes)
replyPacket.Data = req.Data[readBytes : readBytes+bufSize]
e := replyPacket.readFromConn(conn, proto.ReadDeadlineTime)
if e != nil {
log.LogWarnf("Extent Reader Read: failed to read from connect, ino(%v) req(%v) readBytes(%v) err(%v)", reader.inode, reqPacket, readBytes, e)
// Upon receiving TryOtherAddrError, other hosts will be retried.
return TryOtherAddrError, false
}
if replyPacket.ResultCode == proto.OpAgain {
return nil, true
}
e = reader.checkStreamReply(reqPacket, replyPacket)
if e != nil {
log.LogWarnf("checkStreamReply failed:(%v) reply msg:(%v)", e, replyPacket.GetResultMsg())
// Don't change the error message, since the caller will
// check whether it is NotLeaderErr.
return e, false
}
readBytes += int(replyPacket.Size)
}
return nil, false
})
if err != nil {
// if it is a cold volume and the cache is invalid
if !reader.retryRead && (err == TryOtherAddrError || strings.Contains(err.Error(), "ExistErr")) {
log.LogWarnf("Extent Reader Read: err(%v) req(%v) reqPacket(%v)", err, req, reqPacket)
} else {
log.LogErrorf("Extent Reader Read: err(%v) req(%v) reqPacket(%v)", err, req, reqPacket)
}
}
log.LogDebugf("ExtentReader Read exit: req(%v) reqPacket(%v) readBytes(%v) err(%v)", req, reqPacket, readBytes, err)
return
}
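// exampleReadWholeExtent is a minimal sketch (assumption: the reader was obtained from
// Streamer.GetExtentReader for the same extent key): it reads one whole extent into a
// freshly allocated buffer and returns only the bytes actually read.
func exampleReadWholeExtent(reader *ExtentReader, ek *proto.ExtentKey) ([]byte, error) {
	data := make([]byte, ek.Size)
	req := NewExtentRequest(int(ek.FileOffset), int(ek.Size), data, ek)
	readBytes, err := reader.Read(req)
	if err != nil {
		return nil, err
	}
	return data[:readBytes], nil
}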
func (reader *ExtentReader) checkStreamReply(request *Packet, reply *Packet) (err error) {
if reply.ResultCode == proto.OpTryOtherAddr {
return TryOtherAddrError
}
if reply.ResultCode != proto.OpOk {
if request.Opcode == proto.OpStreamFollowerRead {
log.LogWarnf("checkStreamReply: ResultCode(%v) NOK, OpStreamFollowerRead return TryOtherAddrError, "+
"req(%v) reply(%v)", reply.GetResultMsg(), request, reply)
return TryOtherAddrError
}
err = errors.New(fmt.Sprintf("checkStreamReply: ResultCode(%v) NOK", reply.GetResultMsg()))
return
}
if !request.isValidReadReply(reply) {
err = errors.New(fmt.Sprintf("checkStreamReply: inconsistent req and reply, req(%v) reply(%v)", request, reply))
return
}
expectCrc := crc32.ChecksumIEEE(reply.Data[:reply.Size])
if reply.CRC != expectCrc {
err = errors.New(fmt.Sprintf("checkStreamReply: inconsistent CRC, expectCRC(%v) replyCRC(%v)", expectCrc, reply.CRC))
return
}
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"net"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/util"
)
// Packet defines a wrapper of the packet in proto.
type Packet struct {
proto.Packet
inode uint64
errCount int
}
// String returns the string format of the packet.
func (p *Packet) String() string {
return fmt.Sprintf("ReqID(%v)Op(%v)Inode(%v)FileOffset(%v)Size(%v)PartitionID(%v)ExtentID(%v)ExtentOffset(%v)CRC(%v)ResultCode(%v:%v)Seq(%v)",
p.ReqID, p.GetOpMsg(), p.inode, p.KernelOffset, p.Size, p.PartitionID, p.ExtentID, p.ExtentOffset, p.CRC, p.ResultCode, p.GetResultMsg(), p.VerSeq)
}
func NewWriteTinyDirectly(inode uint64, dpID uint64, offset int, dp *wrapper.DataPartition) *Packet {
reqPacket := NewWritePacket(inode, offset, proto.TinyExtentType)
reqPacket.PartitionID = dpID
reqPacket.RemainingFollowers = uint8(len(dp.Hosts) - 1)
reqPacket.Arg = ([]byte)(dp.GetAllAddrs())
reqPacket.ArgLen = uint32(len(reqPacket.Arg))
if len(dp.Hosts) == 1 {
reqPacket.RemainingFollowers = 127
}
return reqPacket
}
// NewWritePacket returns a new write packet.
func NewWritePacket(inode uint64, fileOffset, storeMode int) *Packet {
p := new(Packet)
p.ReqID = proto.GenerateRequestID()
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpWrite
p.inode = inode
p.KernelOffset = uint64(fileOffset)
if storeMode == proto.TinyExtentType {
p.Data, _ = proto.Buffers.Get(util.DefaultTinySizeLimit)
} else {
p.Data, _ = proto.Buffers.Get(util.BlockSize)
}
return p
}
// NewOverwriteByAppendPacket returns a new overwrite-by-append packet.
func NewOverwriteByAppendPacket(dp *wrapper.DataPartition, extentID uint64, extentOffset int,
inode uint64, fileOffset int, direct bool, op uint8) *Packet {
p := new(Packet)
p.PartitionID = dp.PartitionID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
p.ExtentID = extentID
p.ExtentOffset = int64(extentOffset)
p.ReqID = proto.GenerateRequestID()
p.Arg = nil
p.ArgLen = 0
p.RemainingFollowers = 0
p.Opcode = op
if direct {
if op == proto.OpRandomWriteAppend {
p.Opcode = proto.OpSyncRandomWriteAppend
} else if op == proto.OpTryWriteAppend {
p.Opcode = proto.OpSyncTryWriteAppend
}
}
p.inode = inode
p.KernelOffset = uint64(fileOffset)
p.Data, _ = proto.Buffers.Get(util.BlockSize)
return p
}
// NewOverwritePacket returns a new overwrite packet.
func NewOverwritePacket(dp *wrapper.DataPartition, extentID uint64, extentOffset int, inode uint64, fileOffset int) *Packet {
p := new(Packet)
p.PartitionID = dp.PartitionID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
p.ExtentID = extentID
p.ExtentOffset = int64(extentOffset)
p.ReqID = proto.GenerateRequestID()
p.Arg = nil
p.ArgLen = 0
p.RemainingFollowers = 0
p.Opcode = proto.OpRandomWriteVer // proto.OpRandomWrite
p.inode = inode
p.KernelOffset = uint64(fileOffset)
p.Data, _ = proto.Buffers.Get(util.BlockSize)
return p
}
// NewReadPacket returns a new read packet.
func NewReadPacket(key *proto.ExtentKey, extentOffset, size int, inode uint64, fileOffset int, followerRead bool) *Packet {
p := new(Packet)
p.ExtentID = key.ExtentId
p.PartitionID = key.PartitionId
p.Magic = proto.ProtoMagic
p.ExtentOffset = int64(extentOffset)
p.Size = uint32(size)
if followerRead {
p.Opcode = proto.OpStreamFollowerRead
} else {
p.Opcode = proto.OpStreamRead
}
p.ExtentType = proto.NormalExtentType
p.ReqID = proto.GenerateRequestID()
p.RemainingFollowers = 0
p.inode = inode
p.KernelOffset = uint64(fileOffset)
return p
}
// NewCreateExtentPacket returns a new packet to create extent.
func NewCreateExtentPacket(dp *wrapper.DataPartition, inode uint64) *Packet {
p := new(Packet)
p.PartitionID = dp.PartitionID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
p.Arg = ([]byte)(dp.GetAllAddrs())
p.ArgLen = uint32(len(p.Arg))
p.RemainingFollowers = uint8(len(dp.Hosts) - 1)
if len(dp.Hosts) == 1 {
p.RemainingFollowers = 127
}
p.ReqID = proto.GenerateRequestID()
p.Opcode = proto.OpCreateExtent
p.Data = make([]byte, 8)
binary.BigEndian.PutUint64(p.Data, inode)
p.Size = uint32(len(p.Data))
return p
}
// NewReply returns a new reply packet. TODO rename to NewReplyPacket?
func NewReply(reqID int64, partitionID uint64, extentID uint64) *Packet {
p := new(Packet)
p.ReqID = reqID
p.PartitionID = partitionID
p.ExtentID = extentID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
return p
}
func (p *Packet) isValidWriteReply(q *Packet) bool {
	return p.ReqID == q.ReqID && p.PartitionID == q.PartitionID
}
func (p *Packet) isValidReadReply(q *Packet) bool {
	return p.ReqID == q.ReqID && p.PartitionID == q.PartitionID && p.ExtentID == q.ExtentID
}
func (p *Packet) writeToConn(conn net.Conn) error {
p.CRC = crc32.ChecksumIEEE(p.Data[:p.Size])
return p.WriteToConn(conn)
}
func (p *Packet) readFromConn(c net.Conn, deadlineTime time.Duration) (err error) {
if deadlineTime != proto.NoReadDeadlineTime {
c.SetReadDeadline(time.Now().Add(deadlineTime * time.Second))
}
header, _ := proto.Buffers.Get(util.PacketHeaderSize)
defer proto.Buffers.Put(header)
if _, err = io.ReadFull(c, header); err != nil {
return
}
if err = p.UnmarshalHeader(header); err != nil {
return
}
if p.ArgLen > 0 {
if err = readToBuffer(c, &p.Arg, int(p.ArgLen)); err != nil {
return
}
}
if p.Size == 0 {
return
}
size := int(p.Size)
if size > len(p.Data) {
size = len(p.Data)
}
_, err = io.ReadFull(c, p.Data[:size])
return
}
func readToBuffer(c net.Conn, buf *[]byte, readSize int) (err error) {
if *buf == nil || readSize != util.BlockSize {
*buf = make([]byte, readSize)
}
_, err = io.ReadFull(c, (*buf)[:readSize])
return
}
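// exampleWriteRoundTrip is a hedged sketch (assumption: conn is an already-connected
// *net.TCPConn to the leader of the packet's partition) showing how the helpers above
// fit together: send a write packet, read its reply, and validate the result code,
// identity, and CRC before trusting it.
func exampleWriteRoundTrip(conn *net.TCPConn, req *Packet) error {
	if err := req.writeToConn(conn); err != nil {
		return err
	}
	reply := NewReply(req.ReqID, req.PartitionID, req.ExtentID)
	if err := reply.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
		return err
	}
	if reply.ResultCode != proto.OpOk {
		return fmt.Errorf("reply NOK: %v", reply.GetResultMsg())
	}
	if !req.isValidWriteReply(reply) || req.CRC != reply.CRC {
		return fmt.Errorf("request and reply do not match: req(%v) reply(%v)", req, reply)
	}
	return nil
}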
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"fmt"
"net"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
var (
TryOtherAddrError = errors.New("TryOtherAddrError")
DpDiscardError = errors.New("DpDiscardError")
)
const (
StreamSendMaxRetry = 200
StreamSendSleepInterval = 100 * time.Millisecond
)
type GetReplyFunc func(conn *net.TCPConn) (err error, again bool)
// StreamConn defines the struct of the stream connection.
type StreamConn struct {
dp *wrapper.DataPartition
currAddr string
}
var StreamConnPool = util.NewConnectPool()
// NewStreamConn returns a new stream connection.
func NewStreamConn(dp *wrapper.DataPartition, follower bool) (sc *StreamConn) {
if !follower {
sc = &StreamConn{
dp: dp,
currAddr: dp.LeaderAddr,
}
return
}
defer func() {
if sc.currAddr == "" {
/*
* If followerRead is enabled, and there is no preferred choice,
* currAddr can be arbitrarily selected from the hosts.
*/
for _, h := range dp.Hosts {
if h != "" {
sc.currAddr = h
break
}
}
}
}()
if dp.ClientWrapper.NearRead() {
sc = &StreamConn{
dp: dp,
currAddr: getNearestHost(dp),
}
return
}
epoch := atomic.AddUint64(&dp.Epoch, 1)
hosts := sortByStatus(dp, false)
choice := len(hosts)
currAddr := dp.LeaderAddr
if choice > 0 {
index := int(epoch) % choice
currAddr = hosts[index]
}
sc = &StreamConn{
dp: dp,
currAddr: currAddr,
}
return
}
// String returns the string format of the stream connection.
func (sc *StreamConn) String() string {
return fmt.Sprintf("Partition(%v) CurrentAddr(%v) Hosts(%v)", sc.dp.PartitionID, sc.currAddr, sc.dp.Hosts)
}
// Send sends the given packet over the network through the stream connection until it succeeds
// or the maximum number of retries is reached.
func (sc *StreamConn) Send(retry *bool, req *Packet, getReply GetReplyFunc) (err error) {
for i := 0; i < StreamSendMaxRetry; i++ {
err = sc.sendToDataPartition(req, retry, getReply)
if err == nil || err == proto.ErrCodeVersionOp || !*retry || err == TryOtherAddrError {
return
}
log.LogWarnf("StreamConn Send: err(%v)", err)
time.Sleep(StreamSendSleepInterval)
}
return errors.New(fmt.Sprintf("StreamConn Send: retried %v times and still failed, sc(%v) reqPacket(%v)", StreamSendMaxRetry, sc, req))
}
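// exampleSendWithReply is a hedged sketch of the GetReplyFunc contract (illustrative
// only, not used by the SDK): returning (nil, true) asks sendToConn to retry on the same
// connection, returning TryOtherAddrError (while retry is enabled) lets
// sendToDataPartition fail over to the other hosts, and other errors are retried by Send
// itself until StreamSendMaxRetry is reached.
func exampleSendWithReply(sc *StreamConn, req *Packet) error {
	retry := true
	return sc.Send(&retry, req, func(conn *net.TCPConn) (error, bool) {
		reply := NewReply(req.ReqID, req.PartitionID, req.ExtentID)
		if err := reply.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
			return TryOtherAddrError, false
		}
		if reply.ResultCode == proto.OpAgain {
			return nil, true
		}
		if reply.ResultCode != proto.OpOk {
			return errors.New(fmt.Sprintf("reply NOK: %v", reply.GetResultMsg())), false
		}
		return nil, false
	})
}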
func (sc *StreamConn) sendToDataPartition(req *Packet, retry *bool, getReply GetReplyFunc) (err error) {
conn, err := StreamConnPool.GetConnect(sc.currAddr)
if err == nil {
log.LogDebugf("req opcode %v, conn %v", req.Opcode, conn)
err = sc.sendToConn(conn, req, getReply)
if err == nil {
StreamConnPool.PutConnect(conn, false)
return
}
log.LogWarnf("sendToDataPartition: send to curr addr failed, addr(%v) reqPacket(%v) err(%v)", sc.currAddr, req, err)
StreamConnPool.PutConnect(conn, true)
if err != TryOtherAddrError || !*retry {
return
}
} else {
log.LogWarnf("sendToDataPartition: get connection to curr addr failed, addr(%v) reqPacket(%v) err(%v)", sc.currAddr, req, err)
}
hosts := sortByStatus(sc.dp, true)
for _, addr := range hosts {
log.LogWarnf("sendToDataPartition: try addr(%v) reqPacket(%v)", addr, req)
conn, err = StreamConnPool.GetConnect(addr)
if err != nil {
log.LogWarnf("sendToDataPartition: failed to get connection to addr(%v) reqPacket(%v) err(%v)", addr, req, err)
continue
}
sc.currAddr = addr
sc.dp.LeaderAddr = addr
err = sc.sendToConn(conn, req, getReply)
if err == nil {
StreamConnPool.PutConnect(conn, false)
return
}
StreamConnPool.PutConnect(conn, true)
if err != TryOtherAddrError {
return
}
log.LogWarnf("sendToDataPartition: try addr(%v) failed! reqPacket(%v) err(%v)", addr, req, err)
}
return errors.New(fmt.Sprintf("sendToPatition Failed: sc(%v) reqPacket(%v)", sc, req))
}
func (sc *StreamConn) sendToConn(conn *net.TCPConn, req *Packet, getReply GetReplyFunc) (err error) {
for i := 0; i < StreamSendMaxRetry; i++ {
log.LogDebugf("sendToConn: send to addr(%v), reqPacket(%v)", sc.currAddr, req)
err = req.WriteToConn(conn)
if err != nil {
msg := fmt.Sprintf("sendToConn: failed to write to addr(%v) err(%v)", sc.currAddr, err)
log.LogWarn(msg)
break
}
var again bool
err, again = getReply(conn)
if !again {
if err != nil {
log.LogWarnf("sendToConn: getReply error and RETURN, addr(%v) reqPacket(%v) err(%v)", sc.currAddr, req, err)
}
break
}
log.LogWarnf("sendToConn: getReply error and will RETRY, sc(%v) err(%v)", sc, err)
time.Sleep(StreamSendSleepInterval)
}
log.LogDebugf("sendToConn exit: send to addr(%v) reqPacket(%v) err(%v)", sc.currAddr, req, err)
return
}
// sortByStatus returns the DataPartition's host list sorted by host status.
// If selectAll is true, hosts with status(true) come first and hosts with status(false) follow.
// If selectAll is false, only hosts with status(true) are returned.
func sortByStatus(dp *wrapper.DataPartition, selectAll bool) (hosts []string) {
var failedHosts []string
hostsStatus := dp.ClientWrapper.HostsStatus
var dpHosts []string
if dp.ClientWrapper.FollowerRead() && dp.ClientWrapper.NearRead() {
dpHosts = dp.NearHosts
if len(dpHosts) == 0 {
dpHosts = dp.Hosts
}
} else {
dpHosts = dp.Hosts
}
for _, addr := range dpHosts {
status, ok := hostsStatus[addr]
if ok {
if status {
hosts = append(hosts, addr)
} else {
failedHosts = append(failedHosts, addr)
}
} else {
failedHosts = append(failedHosts, addr)
log.LogWarnf("sortByStatus: can not find host[%v] in HostsStatus, dp[%d]", addr, dp.PartitionID)
}
}
if selectAll {
hosts = append(hosts, failedHosts...)
}
return
}
func getNearestHost(dp *wrapper.DataPartition) string {
hostsStatus := dp.ClientWrapper.HostsStatus
for _, addr := range dp.NearHosts {
status, ok := hostsStatus[addr]
if ok {
if !status {
continue
}
}
return addr
}
return dp.LeaderAddr
}
func NewStreamConnByHost(host string) *StreamConn {
return &StreamConn{
currAddr: host,
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"context"
"fmt"
"io"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/blockcache/bcache"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/buf"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
// One inode corresponds to one streamer. All the requests to the same inode will be queued
// (see the usage sketch after NewStreamer).
// TODO rename: streamer is not a good name here, as it also handles overwrites, not just stream writes.
type Streamer struct {
client *ExtentClient
inode uint64
parentInode uint64
status int32
refcnt int
idle int // how long there is no new request
traversed int // how many times the streamer is traversed
extents *ExtentCache
once sync.Once
handler *ExtentHandler // current open handler
dirtylist *DirtyExtentList // dirty handlers
dirty bool // whether current open handler is in the dirty list
isOpen bool
needBCache bool
request chan interface{} // request channel, write/flush/close
done chan struct{} // stream writer is being closed
writeLock sync.Mutex
inflightEvictL1cache sync.Map
pendingCache chan bcacheKey
verSeq uint64
needUpdateVer int32
}
type bcacheKey struct {
cacheKey string
extentKey *proto.ExtentKey
}
// NewStreamer returns a new streamer.
func NewStreamer(client *ExtentClient, inode uint64) *Streamer {
s := new(Streamer)
s.client = client
s.inode = inode
s.parentInode = 0
s.extents = NewExtentCache(inode)
s.request = make(chan interface{}, 64)
s.done = make(chan struct{})
s.dirtylist = NewDirtyExtentList()
s.isOpen = true
s.pendingCache = make(chan bcacheKey, 1)
s.verSeq = client.multiVerMgr.latestVerSeq
s.extents.verSeq = client.multiVerMgr.latestVerSeq
go s.server()
go s.asyncBlockCache()
return s
}
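// exampleStreamerWriteFlush is a hedged usage sketch (illustrative only): requests are
// funneled through the streamer's request channel, so a write followed by a flush is
// serialized with any other operations queued on the same inode.
func exampleStreamerWriteFlush(s *Streamer, data []byte, offset int) (int, error) {
	written, err := s.IssueWriteRequest(offset, data, 0, nil)
	if err != nil {
		return written, err
	}
	return written, s.IssueFlushRequest()
}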
func (s *Streamer) SetParentInode(inode uint64) {
s.parentInode = inode
}
// String returns the string format of the streamer.
func (s *Streamer) String() string {
return fmt.Sprintf("Streamer{ino(%v)}", s.inode)
}
// TODO should we call it RefreshExtents instead?
func (s *Streamer) GetExtents() error {
if s.client.disableMetaCache || !s.needBCache {
return s.extents.RefreshForce(s.inode, s.client.getExtents)
}
return s.extents.Refresh(s.inode, s.client.getExtents)
}
func (s *Streamer) GetExtentsForce() error {
return s.extents.RefreshForce(s.inode, s.client.getExtents)
}
// GetExtentReader returns the extent reader.
// TODO: use memory pool
func (s *Streamer) GetExtentReader(ek *proto.ExtentKey) (*ExtentReader, error) {
partition, err := s.client.dataWrapper.GetDataPartition(ek.PartitionId)
if err != nil {
return nil, err
}
if partition.IsDiscard {
log.LogWarnf("GetExtentReader: datapartition %v is discard", partition.PartitionID)
return nil, DpDiscardError
}
retryRead := true
if proto.IsCold(s.client.volumeType) {
retryRead = false
}
reader := NewExtentReader(s.inode, ek, partition, s.client.dataWrapper.FollowerRead(), retryRead)
return reader, nil
}
func (s *Streamer) read(data []byte, offset int, size int) (total int, err error) {
var (
readBytes int
reader *ExtentReader
requests []*ExtentRequest
revisedRequests []*ExtentRequest
)
log.LogDebugf("action[streamer.read] offset %v size %v", offset, size)
ctx := context.Background()
s.client.readLimiter.Wait(ctx)
s.client.LimitManager.ReadAlloc(ctx, size)
requests = s.extents.PrepareReadRequests(offset, size, data)
for _, req := range requests {
if req.ExtentKey == nil {
continue
}
if req.ExtentKey.PartitionId == 0 || req.ExtentKey.ExtentId == 0 {
s.writeLock.Lock()
if err = s.IssueFlushRequest(); err != nil {
s.writeLock.Unlock()
return 0, err
}
revisedRequests = s.extents.PrepareReadRequests(offset, size, data)
s.writeLock.Unlock()
break
}
}
if revisedRequests != nil {
requests = revisedRequests
}
filesize, _ := s.extents.Size()
log.LogDebugf("read: ino(%v) requests(%v) filesize(%v)", s.inode, requests, filesize)
for _, req := range requests {
log.LogDebugf("action[streamer.read] req %v", req)
if req.ExtentKey == nil {
zeros := make([]byte, len(req.Data))
copy(req.Data, zeros)
if req.FileOffset+req.Size > filesize {
if req.FileOffset > filesize {
return
}
req.Size = filesize - req.FileOffset
total += req.Size
err = io.EOF
return
}
// Reading a hole, just fill zero
total += req.Size
log.LogDebugf("Stream read hole: ino(%v) req(%v) total(%v)", s.inode, req, total)
} else {
log.LogDebugf("Stream read: ino(%v) req(%v) s.needBCache(%v) s.client.bcacheEnable(%v)", s.inode, req, s.needBCache, s.client.bcacheEnable)
if s.needBCache {
bcacheMetric := exporter.NewCounter("fileReadL1Cache")
bcacheMetric.AddWithLabels(1, map[string]string{exporter.Vol: s.client.volumeName})
}
// not a hole: ek is not nil, so read the block cache first
log.LogDebugf("Stream read: ino(%v) req(%v) s.client.bcacheEnable(%v) s.needBCache(%v)", s.inode, req, s.client.bcacheEnable, s.needBCache)
cacheKey := util.GenerateRepVolKey(s.client.volumeName, s.inode, req.ExtentKey.PartitionId, req.ExtentKey.ExtentId, req.ExtentKey.FileOffset)
if s.client.bcacheEnable && s.needBCache && filesize <= bcache.MaxFileSize {
offset := req.FileOffset - int(req.ExtentKey.FileOffset)
if s.client.loadBcache != nil {
readBytes, err = s.client.loadBcache(cacheKey, req.Data, uint64(offset), uint32(req.Size))
if err == nil && readBytes == req.Size {
total += req.Size
bcacheMetric := exporter.NewCounter("fileReadL1CacheHit")
bcacheMetric.AddWithLabels(1, map[string]string{exporter.Vol: s.client.volumeName})
log.LogDebugf("TRACE Stream read. hit blockCache: ino(%v) cacheKey(%v) readBytes(%v) err(%v)", s.inode, cacheKey, readBytes, err)
continue
}
}
log.LogDebugf("TRACE Stream read. miss blockCache cacheKey(%v) loadBcache(%v)", cacheKey, s.client.loadBcache)
}
if s.needBCache {
bcacheMetric := exporter.NewCounter("fileReadL1CacheMiss")
bcacheMetric.AddWithLabels(1, map[string]string{exporter.Vol: s.client.volumeName})
}
// read extent
reader, err = s.GetExtentReader(req.ExtentKey)
if err != nil {
log.LogErrorf("action[streamer.read] req %v err %v", req, err)
break
}
if s.client.bcacheEnable && s.needBCache && filesize <= bcache.MaxFileSize {
// limit big block cache
if s.exceedBlockSize(req.ExtentKey.Size) && atomic.LoadInt32(&s.client.inflightL1BigBlock) > 10 {
// do nothing
} else {
select {
case s.pendingCache <- bcacheKey{cacheKey: cacheKey, extentKey: req.ExtentKey}:
if s.exceedBlockSize(req.ExtentKey.Size) {
atomic.AddInt32(&s.client.inflightL1BigBlock, 1)
}
default:
}
}
}
readBytes, err = reader.Read(req)
log.LogDebugf("TRACE Stream read: ino(%v) req(%v) readBytes(%v) err(%v)", s.inode, req, readBytes, err)
total += readBytes
if err != nil || readBytes < req.Size {
if total == 0 {
log.LogErrorf("Stream read: ino(%v) req(%v) readBytes(%v) err(%v)", s.inode, req, readBytes, err)
}
break
}
}
}
log.LogDebugf("action[streamer.read] offset %v size %v exit", offset, size)
return
}
func (s *Streamer) asyncBlockCache() {
if !s.needBCache || !s.isOpen {
return
}
t := time.NewTicker(3 * time.Second)
defer t.Stop()
for {
select {
case pending := <-s.pendingCache:
ek := pending.extentKey
cacheKey := pending.cacheKey
log.LogDebugf("asyncBlockCache: cacheKey=(%v) ek=(%v)", cacheKey, ek)
// read full extent
var data []byte
if ek.Size == bcache.MaxBlockSize {
data = buf.BCachePool.Get()
} else {
data = make([]byte, ek.Size)
}
reader, err := s.GetExtentReader(ek)
fullReq := NewExtentRequest(int(ek.FileOffset), int(ek.Size), data, ek)
readBytes, err := reader.Read(fullReq)
if err != nil || readBytes != len(data) {
log.LogWarnf("asyncBlockCache: Stream read full extent error. fullReq(%v) readBytes(%v) err(%v)", fullReq, readBytes, err)
if ek.Size == bcache.MaxBlockSize {
buf.BCachePool.Put(data)
}
if s.exceedBlockSize(ek.Size) {
atomic.AddInt32(&s.client.inflightL1BigBlock, -1)
}
return
}
if s.client.cacheBcache != nil {
log.LogDebugf("TRACE read. write blockCache cacheKey(%v) len_buf(%v),", cacheKey, len(data))
s.client.cacheBcache(cacheKey, data)
}
if ek.Size == bcache.MaxBlockSize {
buf.BCachePool.Put(data)
}
if s.exceedBlockSize(ek.Size) {
atomic.AddInt32(&s.client.inflightL1BigBlock, -1)
}
case <-t.C:
if s.refcnt <= 0 {
s.isOpen = false
return
}
}
}
}
func (s *Streamer) exceedBlockSize(size uint32) bool {
	return size > bcache.BigExtentSize
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"context"
"fmt"
"hash/crc32"
"net"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
MaxSelectDataPartitionForWrite = 32
MaxNewHandlerRetry = 3
MaxPacketErrorCount = 128
MaxDirtyListLen = 0
)
const (
StreamerNormal int32 = iota
StreamerError
LastEKVersionNotEqual
)
const (
streamWriterFlushPeriod = 3
streamWriterIdleTimeoutPeriod = 10
)
// VerUpdateRequest defines a verseq update request.
type VerUpdateRequest struct {
err error
verSeq uint64
done chan struct{}
}
// OpenRequest defines an open request.
type OpenRequest struct {
done chan struct{}
}
// WriteRequest defines a write request.
type WriteRequest struct {
fileOffset int
size int
data []byte
flags int
writeBytes int
err error
done chan struct{}
checkFunc func() error
}
// FlushRequest defines a flush request.
type FlushRequest struct {
err error
done chan struct{}
}
// ReleaseRequest defines a release request.
type ReleaseRequest struct {
err error
done chan struct{}
}
// TruncRequest defines a truncate request.
type TruncRequest struct {
size int
err error
fullPath string
done chan struct{}
}
// EvictRequest defines an evict request.
type EvictRequest struct {
err error
done chan struct{}
}
// The open request shall hold the streamer lock until the request has been sent to the request
// channel (see the usage sketch after this function).
func (s *Streamer) IssueOpenRequest() error {
request := openRequestPool.Get().(*OpenRequest)
request.done = make(chan struct{}, 1)
s.request <- request
s.client.streamerLock.Unlock()
<-request.done
openRequestPool.Put(request)
return nil
}
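// exampleOpenStreamer is a hedged sketch of the caller-side convention described above
// (assumption: callers outside this file, such as the extent client's open path, follow
// this pattern): the streamer lock is taken before issuing the open request and is
// released inside IssueOpenRequest once the request has been queued.
func exampleOpenStreamer(s *Streamer) error {
	s.client.streamerLock.Lock()
	s.refcnt++
	return s.IssueOpenRequest() // unlocks s.client.streamerLock before blocking on done
}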
func (s *Streamer) IssueWriteRequest(offset int, data []byte, flags int, checkFunc func() error) (write int, err error) {
if atomic.LoadInt32(&s.status) >= StreamerError {
return 0, errors.New(fmt.Sprintf("IssueWriteRequest: stream writer in error status, ino(%v)", s.inode))
}
s.writeLock.Lock()
request := writeRequestPool.Get().(*WriteRequest)
request.data = data
request.fileOffset = offset
request.size = len(data)
request.flags = flags
request.done = make(chan struct{}, 1)
request.checkFunc = checkFunc
s.request <- request
s.writeLock.Unlock()
<-request.done
err = request.err
write = request.writeBytes
writeRequestPool.Put(request)
return
}
func (s *Streamer) IssueFlushRequest() error {
request := flushRequestPool.Get().(*FlushRequest)
request.done = make(chan struct{}, 1)
s.request <- request
<-request.done
err := request.err
flushRequestPool.Put(request)
return err
}
func (s *Streamer) IssueReleaseRequest() error {
request := releaseRequestPool.Get().(*ReleaseRequest)
request.done = make(chan struct{}, 1)
s.request <- request
s.client.streamerLock.Unlock()
<-request.done
err := request.err
releaseRequestPool.Put(request)
return err
}
func (s *Streamer) IssueTruncRequest(size int, fullPath string) error {
request := truncRequestPool.Get().(*TruncRequest)
request.size = size
request.fullPath = fullPath
request.done = make(chan struct{}, 1)
s.request <- request
<-request.done
err := request.err
truncRequestPool.Put(request)
return err
}
func (s *Streamer) IssueEvictRequest() error {
request := evictRequestPool.Get().(*EvictRequest)
request.done = make(chan struct{}, 1)
s.request <- request
s.client.streamerLock.Unlock()
<-request.done
err := request.err
evictRequestPool.Put(request)
return err
}
func (s *Streamer) GetStoreMod(offset int, size int) (storeMode int) {
// Small files are usually written in a single write, so use tiny extent
// store only for the first write operation.
if offset > 0 || offset+size > s.tinySizeLimit() {
storeMode = proto.NormalExtentType
} else {
storeMode = proto.TinyExtentType
}
return
}
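// exampleChooseHandler is a minimal sketch (illustrative only) of how the store mode
// returned by GetStoreMod feeds into handler creation: only the very first write of a
// small file lands on the tiny extent store, while everything else uses normal extents.
func exampleChooseHandler(s *Streamer, offset, size int) *ExtentHandler {
	storeMode := s.GetStoreMod(offset, size)
	return NewExtentHandler(s, offset, storeMode, size)
}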
func (s *Streamer) server() {
t := time.NewTicker(2 * time.Second)
defer t.Stop()
for {
select {
case request := <-s.request:
s.handleRequest(request)
s.idle = 0
s.traversed = 0
case <-s.done:
s.abort()
log.LogDebugf("done server: evict, ino(%v)", s.inode)
return
case <-t.C:
s.traverse()
if s.refcnt <= 0 {
s.client.streamerLock.Lock()
if s.idle >= streamWriterIdleTimeoutPeriod && len(s.request) == 0 {
if s.client.disableMetaCache || !s.needBCache {
delete(s.client.streamers, s.inode)
if s.client.evictIcache != nil {
s.client.evictIcache(s.inode)
}
}
s.isOpen = false
// fail the remaining requests in such case
s.clearRequests()
s.client.streamerLock.Unlock()
log.LogDebugf("done server: no requests for a long time, ino(%v)", s.inode)
return
}
s.client.streamerLock.Unlock()
s.idle++
}
}
}
}
func (s *Streamer) clearRequests() {
for {
select {
case request := <-s.request:
s.abortRequest(request)
default:
return
}
}
}
func (s *Streamer) abortRequest(request interface{}) {
switch request := request.(type) {
case *OpenRequest:
request.done <- struct{}{}
case *WriteRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
case *TruncRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
case *FlushRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
case *ReleaseRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
case *EvictRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
default:
}
}
func (s *Streamer) handleRequest(request interface{}) {
if atomic.LoadInt32(&s.needUpdateVer) == 1 {
s.closeOpenHandler()
atomic.StoreInt32(&s.needUpdateVer, 0)
}
switch request := request.(type) {
case *OpenRequest:
s.open()
request.done <- struct{}{}
case *WriteRequest:
request.writeBytes, request.err = s.write(request.data, request.fileOffset, request.size, request.flags, request.checkFunc)
request.done <- struct{}{}
case *TruncRequest:
request.err = s.truncate(request.size, request.fullPath)
request.done <- struct{}{}
case *FlushRequest:
request.err = s.flush()
request.done <- struct{}{}
case *ReleaseRequest:
request.err = s.release()
request.done <- struct{}{}
case *EvictRequest:
request.err = s.evict()
request.done <- struct{}{}
case *VerUpdateRequest:
request.err = s.updateVer(request.verSeq)
request.done <- struct{}{}
default:
}
}
func (s *Streamer) write(data []byte, offset, size, flags int, checkFunc func() error) (total int, err error) {
var (
direct bool
retryTimes int8
)
if flags&proto.FlagsSyncWrite != 0 {
direct = true
}
begin:
if flags&proto.FlagsAppend != 0 {
filesize, _ := s.extents.Size()
offset = filesize
}
log.LogDebugf("Streamer write enter: ino(%v) offset(%v) size(%v) flags(%v)", s.inode, offset, size, flags)
ctx := context.Background()
s.client.writeLimiter.Wait(ctx)
requests := s.extents.PrepareWriteRequests(offset, size, data)
log.LogDebugf("Streamer write: ino(%v) prepared requests(%v)", s.inode, requests)
isChecked := false
// Must flush before doing overwrite
for _, req := range requests {
if req.ExtentKey == nil {
continue
}
err = s.flush()
if err != nil {
return
}
// An extent key with partition id 0 in the requests means an append operation is still in flight.
// Flush first to obtain the final key, which is then used for the modification.
requests = s.extents.PrepareWriteRequests(offset, size, data)
log.LogDebugf("Streamer write: ino(%v) prepared requests after flush(%v)", s.inode, requests)
break
}
for _, req := range requests {
var writeSize int
if req.ExtentKey != nil {
if s.client.bcacheEnable {
cacheKey := util.GenerateRepVolKey(s.client.volumeName, s.inode, req.ExtentKey.PartitionId, req.ExtentKey.ExtentId, uint64(req.FileOffset))
if _, ok := s.inflightEvictL1cache.Load(cacheKey); !ok {
go func(cacheKey string) {
s.inflightEvictL1cache.Store(cacheKey, true)
s.client.evictBcache(cacheKey)
s.inflightEvictL1cache.Delete(cacheKey)
}(cacheKey)
}
}
log.LogDebugf("action[streamer.write] inode [%v] latest seq [%v] extentkey seq [%v] info [%v] before compare seq",
s.inode, s.verSeq, req.ExtentKey.GetSeq(), req.ExtentKey)
if req.ExtentKey.GetSeq() == s.verSeq {
writeSize, err = s.doOverwrite(req, direct)
if err == proto.ErrCodeVersionOp {
log.LogDebugf("action[streamer.write] write need version update")
if err = s.GetExtentsForce(); err != nil {
log.LogErrorf("action[streamer.write] err %v", err)
return
}
if retryTimes > 3 {
err = proto.ErrCodeVersionOp
log.LogWarnf("action[streamer.write] err %v", err)
return
}
time.Sleep(time.Millisecond * 100)
retryTimes++
log.LogDebugf("action[streamer.write] err %v retryTimes %v", err, retryTimes)
goto begin
}
log.LogDebugf("action[streamer.write] err %v retryTimes %v", err, retryTimes)
} else {
log.LogDebugf("action[streamer.write] ino %v do OverWriteByAppend extent key (%v) because seq not equal", s.inode, req.ExtentKey)
writeSize, _, err, _ = s.doOverWriteByAppend(req, direct)
}
if s.client.bcacheEnable {
cacheKey := util.GenerateKey(s.client.volumeName, s.inode, uint64(req.FileOffset))
go s.client.evictBcache(cacheKey)
}
} else {
if !isChecked && checkFunc != nil {
isChecked = true
if err = checkFunc(); err != nil {
return
}
}
writeSize, err = s.doWriteAppend(req, direct)
}
if err != nil {
log.LogErrorf("Streamer write: ino(%v) err(%v)", s.inode, err)
break
}
total += writeSize
}
if filesize, _ := s.extents.Size(); offset+total > filesize {
s.extents.SetSize(uint64(offset+total), false)
log.LogDebugf("Streamer write: ino(%v) filesize changed to (%v)", s.inode, offset+total)
}
log.LogDebugf("Streamer write exit: ino(%v) offset(%v) size(%v) done total(%v) err(%v)", s.inode, offset, size, total, err)
return
}
func (s *Streamer) doOverWriteByAppend(req *ExtentRequest, direct bool) (total int, extKey *proto.ExtentKey, err error, status int32) {
// The extent key needs to be updated because, when preparing the requests,
// the obtained extent key could be a local key that is inconsistent with the remote key.
// OpTryWriteAppend is a special case and is ignored here.
req.ExtentKey = s.extents.Get(uint64(req.FileOffset))
return s.doDirectWriteByAppend(req, direct, proto.OpRandomWriteAppend)
}
func (s *Streamer) tryDirectAppendWrite(req *ExtentRequest, direct bool) (total int, extKey *proto.ExtentKey, err error, status int32) {
req.ExtentKey = s.handler.key
return s.doDirectWriteByAppend(req, direct, proto.OpTryWriteAppend)
}
func (s *Streamer) doDirectWriteByAppend(req *ExtentRequest, direct bool, op uint8) (total int, extKey *proto.ExtentKey, err error, status int32) {
var (
dp *wrapper.DataPartition
reqPacket *Packet
)
log.LogDebugf("action[doDirectWriteByAppend] inode %v enter in req %v", s.inode, req)
err = s.flush()
if err != nil {
return
}
if req.ExtentKey == nil {
err = errors.New(fmt.Sprintf("doOverwrite: extent key not exist, ino(%v) ekFileOffset(%v) ek(%v)", s.inode, req.FileOffset, req.ExtentKey))
return
}
if dp, err = s.client.dataWrapper.GetDataPartition(req.ExtentKey.PartitionId); err != nil {
// TODO unhandled error
errors.Trace(err, "doDirectWriteByAppend: ino(%v) failed to get datapartition, ek(%v)", s.inode, req.ExtentKey)
return
}
retry := true
if proto.IsCold(s.client.volumeType) {
retry = false
}
log.LogDebugf("action[doDirectWriteByAppend] inode %v data process", s.inode)
addr := dp.LeaderAddr
if storage.IsTinyExtent(req.ExtentKey.ExtentId) {
addr = dp.Hosts[0]
reqPacket = NewWriteTinyDirectly(s.inode, req.ExtentKey.PartitionId, req.FileOffset, dp)
} else {
reqPacket = NewOverwriteByAppendPacket(dp, req.ExtentKey.ExtentId, int(req.ExtentKey.ExtentOffset)+int(req.ExtentKey.Size),
s.inode, req.FileOffset, direct, op)
}
sc := &StreamConn{
dp: dp,
currAddr: addr,
}
replyPacket := new(Packet)
if req.Size > util.BlockSize {
log.LogErrorf("action[doDirectWriteByAppend] inode %v size too large %v", s.inode, req.Size)
panic(nil)
}
for total < req.Size { // normally runs only once, since a key that already exists in the system must be smaller than BlockSize
// The correct position in the extent is offset-ek4FileOffset+total+ekExtOffset;
// ekExtOffset will be set by the reply packet in addExtentInfo (datanode).
if direct {
reqPacket.Opcode = op
}
if req.ExtentKey.ExtentId <= storage.TinyExtentCount {
reqPacket.ExtentType = proto.TinyExtentType
}
packSize := util.Min(req.Size-total, util.BlockSize)
copy(reqPacket.Data[:packSize], req.Data[total:total+packSize])
reqPacket.Size = uint32(packSize)
reqPacket.CRC = crc32.ChecksumIEEE(reqPacket.Data[:packSize])
err = sc.Send(&retry, reqPacket, func(conn *net.TCPConn) (error, bool) {
e := replyPacket.ReadFromConnWithVer(conn, proto.ReadDeadlineTime)
if e != nil {
log.LogWarnf("doDirectWriteByAppend.Stream Writer doOverwrite: ino(%v) failed to read from connect, req(%v) err(%v)", s.inode, reqPacket, e)
// Upon receiving TryOtherAddrError, other hosts will be retried.
return TryOtherAddrError, false
}
log.LogDebugf("action[doDirectWriteByAppend] .UpdateLatestVer ino(%v) get replyPacket %v", s.inode, replyPacket)
if replyPacket.VerSeq > sc.dp.ClientWrapper.SimpleClient.GetLatestVer() {
err = sc.dp.ClientWrapper.SimpleClient.UpdateLatestVer(&proto.VolVersionInfoList{VerList: replyPacket.VerList})
if err != nil {
return err, false
}
}
log.LogDebugf("action[doDirectWriteByAppend] ino(%v) get replyPacket opcode %v resultCode %v", s.inode, replyPacket.Opcode, replyPacket.ResultCode)
if replyPacket.ResultCode == proto.OpAgain {
return nil, true
}
if replyPacket.ResultCode == proto.OpTryOtherExtent {
status = int32(proto.OpTryOtherExtent)
return nil, false
}
if replyPacket.ResultCode == proto.OpTryOtherAddr {
e = TryOtherAddrError
log.LogDebugf("action[doDirectWriteByAppend] data process err %v", e)
}
return e, false
})
proto.Buffers.Put(reqPacket.Data)
reqPacket.Data = nil
log.LogDebugf("doDirectWriteByAppend: ino(%v) req(%v) reqPacket(%v) err(%v) replyPacket(%v)", s.inode, req, reqPacket, err, replyPacket)
if err != nil || replyPacket.ResultCode != proto.OpOk {
status = int32(replyPacket.ResultCode)
err = errors.New(fmt.Sprintf("doOverwrite: failed or reply NOK: err(%v) ino(%v) req(%v) replyPacket(%v)", err, s.inode, req, replyPacket))
log.LogErrorf("action[doDirectWriteByAppend] data process err %v", err)
s.handler.key = nil // the direct-write key can't be reused in the flush process
break
}
if !reqPacket.isValidWriteReply(replyPacket) || reqPacket.CRC != replyPacket.CRC {
err = errors.New(fmt.Sprintf("doOverwrite: is not the corresponding reply, ino(%v) req(%v) replyPacket(%v)", s.inode, req, replyPacket))
log.LogErrorf("action[doDirectWriteByAppend] data process err %v", err)
break
}
total += packSize
break
}
if err != nil {
log.LogErrorf("action[doDirectWriteByAppend] data process err %v", err)
return
}
if replyPacket.VerSeq > s.verSeq {
s.client.UpdateLatestVer(&proto.VolVersionInfoList{VerList: replyPacket.VerList})
}
extKey = &proto.ExtentKey{
FileOffset: uint64(req.FileOffset),
PartitionId: req.ExtentKey.PartitionId,
ExtentId: replyPacket.ExtentID,
ExtentOffset: uint64(replyPacket.ExtentOffset),
Size: uint32(total),
SnapInfo: &proto.ExtSnapInfo{
VerSeq: s.verSeq,
},
}
if op == proto.OpRandomWriteAppend || op == proto.OpSyncRandomWriteAppend {
log.LogDebugf("action[doDirectWriteByAppend] inode %v local cache process start extKey %v", s.inode, extKey)
if err = s.extents.SplitExtentKey(s.inode, extKey); err != nil {
log.LogErrorf("action[doDirectWriteByAppend] inode %v llocal cache process err %v", s.inode, err)
return
}
log.LogDebugf("action[doDirectWriteByAppend] inode %v meta extent split with ek (%v)", s.inode, extKey)
if err = s.client.splitExtentKey(s.parentInode, s.inode, *extKey); err != nil {
log.LogErrorf("action[doDirectWriteByAppend] inode %v meta extent split process err %v", s.inode, err)
return
}
} else {
discards := s.extents.Append(extKey, true)
var st int
if st, err = s.client.appendExtentKey(s.parentInode, s.inode, *extKey, discards); err != nil {
status = int32(st)
log.LogErrorf("action[doDirectWriteByAppend] inode %v meta extent split process err %v", s.inode, err)
return
}
log.LogDebugf("action[doDirectWriteByAppend] handler fileoffset %v size %v key %v", s.handler.fileOffset, s.handler.size, s.handler.key)
// adjust the handler key to last direct write one
s.handler.fileOffset = int(extKey.FileOffset)
s.handler.size = int(extKey.Size)
s.handler.key = extKey
}
if atomic.LoadInt32(&s.needUpdateVer) > 0 {
if err = s.GetExtentsForce(); err != nil {
log.LogErrorf("action[doDirectWriteByAppend] inode %v GetExtents err %v", s.inode, err)
return
}
}
log.LogDebugf("action[doDirectWriteByAppend] inode %v process over!", s.inode)
return
}
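// doOverwrite writes data in place into an existing extent, block by block, after
// flushing any pending appended data. A direct write uses the synchronous random-write
// opcode.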
func (s *Streamer) doOverwrite(req *ExtentRequest, direct bool) (total int, err error) {
var dp *wrapper.DataPartition
err = s.flush()
if err != nil {
return
}
offset := req.FileOffset
size := req.Size
// the extent key needs to be updated because when preparing the requests,
// the obtained extent key could be a local key which can be inconsistent with the remote key.
req.ExtentKey = s.extents.Get(uint64(offset))
if req.ExtentKey == nil {
err = errors.New(fmt.Sprintf("doOverwrite: extent key not exist, ino(%v) fileOffset(%v) ek(%v)", s.inode, offset, req.ExtentKey))
return
}
// read the extent key fields only after the nil check above
ekFileOffset := int(req.ExtentKey.FileOffset)
ekExtOffset := int(req.ExtentKey.ExtentOffset)
if dp, err = s.client.dataWrapper.GetDataPartition(req.ExtentKey.PartitionId); err != nil {
// TODO unhandled error
errors.Trace(err, "doOverwrite: ino(%v) failed to get datapartition, ek(%v)", s.inode, req.ExtentKey)
return
}
retry := true
if proto.IsCold(s.client.volumeType) {
retry = false
}
sc := NewStreamConn(dp, false)
for total < size {
reqPacket := NewOverwritePacket(dp, req.ExtentKey.ExtentId, offset-ekFileOffset+total+ekExtOffset, s.inode, offset)
reqPacket.VerSeq = s.client.multiVerMgr.latestVerSeq
reqPacket.VerList = make([]*proto.VolVersionInfo, len(s.client.multiVerMgr.verList.VerList))
copy(reqPacket.VerList, s.client.multiVerMgr.verList.VerList)
reqPacket.ExtentType |= proto.MultiVersionFlag
reqPacket.ExtentType |= proto.VersionListFlag
log.LogDebugf("action[doOverwrite] inode %v extentid %v,extentOffset %v(%v,%v,%v,%v) offset %v, streamer seq %v", s.inode, req.ExtentKey.ExtentId, reqPacket.ExtentOffset,
offset, ekFileOffset, total, ekExtOffset, offset, s.verSeq)
if direct {
reqPacket.Opcode = proto.OpSyncRandomWrite
}
packSize := util.Min(size-total, util.BlockSize)
copy(reqPacket.Data[:packSize], req.Data[total:total+packSize])
reqPacket.Size = uint32(packSize)
reqPacket.CRC = crc32.ChecksumIEEE(reqPacket.Data[:packSize])
reqPacket.VerSeq = s.verSeq
replyPacket := new(Packet)
err = sc.Send(&retry, reqPacket, func(conn *net.TCPConn) (error, bool) {
e := replyPacket.ReadFromConnWithVer(conn, proto.ReadDeadlineTime)
if e != nil {
log.LogWarnf("Stream Writer doOverwrite: ino(%v) failed to read from connect, req(%v) err(%v)", s.inode, reqPacket, e)
// Upon receiving TryOtherAddrError, other hosts will be retried.
return TryOtherAddrError, false
}
log.LogDebugf("action[doOverwrite] streamer verseq (%v) datanode rsp seq (%v) code(%v)", s.verSeq, replyPacket.VerSeq, replyPacket.ResultCode)
if replyPacket.ResultCode == proto.OpAgain {
return nil, true
}
if replyPacket.ResultCode == proto.OpTryOtherAddr {
e = TryOtherAddrError
}
if replyPacket.ResultCode == proto.ErrCodeVersionOpError {
e = proto.ErrCodeVersionOp
log.LogDebugf("action[doOverwrite] .UpdateLatestVer verseq (%v) be updated by datanode rsp (%v) ", s.verSeq, replyPacket)
s.verSeq = replyPacket.VerSeq
s.extents.verSeq = s.verSeq
s.client.UpdateLatestVer(&proto.VolVersionInfoList{VerList: replyPacket.VerList})
return e, false
}
return e, false
})
proto.Buffers.Put(reqPacket.Data)
reqPacket.Data = nil
log.LogDebugf("doOverwrite: ino(%v) req(%v) reqPacket(%v) err(%v) replyPacket(%v)", s.inode, req, reqPacket, err, replyPacket)
if err != nil || replyPacket.ResultCode != proto.OpOk {
if replyPacket.ResultCode == proto.ErrCodeVersionOpError {
err = proto.ErrCodeVersionOp
log.LogWarnf("doOverwrite: need retry.ino(%v) req(%v) reqPacket(%v) err(%v) replyPacket(%v)", s.inode, req, reqPacket, err, replyPacket)
return
}
err = errors.New(fmt.Sprintf("doOverwrite: failed or reply NOK: err(%v) ino(%v) req(%v) replyPacket(%v)", err, s.inode, req, replyPacket))
break
}
if !reqPacket.isValidWriteReply(replyPacket) || reqPacket.CRC != replyPacket.CRC {
err = errors.New(fmt.Sprintf("doOverwrite: is not the corresponding reply, ino(%v) req(%v) replyPacket(%v)", s.inode, req, replyPacket))
break
}
total += packSize
}
return
}
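// tryInitExtentHandlerByLastEk tries to reuse the extent key that ends at the write
// offset so the write can continue as a sequential append. It returns true when that
// extent key carries a version seq different from the streamer's, meaning the extent
// cannot simply be extended under the current version.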
func (s *Streamer) tryInitExtentHandlerByLastEk(offset, size int) (isLastEkVerNotEqual bool) {
storeMode := s.GetStoreMod(offset, size)
getEndEkFunc := func() *proto.ExtentKey {
if ek := s.extents.GetEndForAppendWrite(uint64(offset), s.verSeq, false); ek != nil && !storage.IsTinyExtent(ek.ExtentId) {
return ek
}
return nil
}
checkVerFunc := func(currentEK *proto.ExtentKey) {
if currentEK.GetSeq() != s.verSeq {
log.LogDebugf("tryInitExtentHandlerByLastEk. exist ek seq %v vs request seq %v", currentEK.GetSeq(), s.verSeq)
if int(currentEK.ExtentOffset)+int(currentEK.Size)+size > util.ExtentSize {
s.closeOpenHandler()
return
}
isLastEkVerNotEqual = true
}
}
initExtentHandlerFunc := func(currentEK *proto.ExtentKey) {
checkVerFunc(currentEK)
log.LogDebugf("tryInitExtentHandlerByLastEk: found ek in ExtentCache, extent_id(%v) req_offset(%v) req_size(%v), currentEK [%v] streamer seq %v",
currentEK.ExtentId, offset, size, currentEK, s.verSeq)
_, pidErr := s.client.dataWrapper.GetDataPartition(currentEK.PartitionId)
if pidErr == nil {
seq := currentEK.GetSeq()
if isLastEkVerNotEqual {
seq = s.verSeq
}
log.LogDebugf("tryInitExtentHandlerByLastEk NewExtentHandler")
handler := NewExtentHandler(s, int(currentEK.FileOffset), storeMode, int(currentEK.Size))
handler.key = &proto.ExtentKey{
FileOffset: currentEK.FileOffset,
PartitionId: currentEK.PartitionId,
ExtentId: currentEK.ExtentId,
ExtentOffset: currentEK.ExtentOffset,
Size: currentEK.Size,
SnapInfo: &proto.ExtSnapInfo{
VerSeq: seq,
},
}
handler.lastKey = *currentEK
if s.handler != nil {
log.LogDebugf("tryInitExtentHandlerByLastEk: close old handler, currentEK.PartitionId(%v)",
currentEK.PartitionId)
s.closeOpenHandler()
}
s.handler = handler
s.dirty = false
log.LogDebugf("tryInitExtentHandlerByLastEk: currentEK.PartitionId(%v) found", currentEK.PartitionId)
} else {
log.LogDebugf("tryInitExtentHandlerByLastEk: currentEK.PartitionId(%v) not found", currentEK.PartitionId)
}
}
if storeMode == proto.NormalExtentType {
if s.handler == nil {
log.LogDebugf("tryInitExtentHandlerByLastEk: handler nil")
if ek := getEndEkFunc(); ek != nil {
initExtentHandlerFunc(ek)
}
} else {
if s.handler.fileOffset+s.handler.size == offset {
if s.handler.key != nil {
checkVerFunc(s.handler.key)
}
return
} else {
if ek := getEndEkFunc(); ek != nil {
log.LogDebugf("tryInitExtentHandlerByLastEk: getEndEkFunc get ek %v", ek)
initExtentHandlerFunc(ek)
} else {
log.LogDebugf("tryInitExtentHandlerByLastEk: not found ek")
}
}
}
}
return
}
// First, attempt sequential writes using neighboring extent keys. If the last extent has a different version,
// it indicates that the extent may have been fully utilized by the previous version.
// Next, try writing and directly checking the extent at the datanode. If the extent cannot be reused, create a new extent for writing.
func (s *Streamer) doWriteAppend(req *ExtentRequest, direct bool) (writeSize int, err error) {
var status int32
// try append write, get response
log.LogDebugf("action[streamer.write] doWriteAppend req: ExtentKey(%v) FileOffset(%v) size(%v)",
req.ExtentKey, req.FileOffset, req.Size)
if writeSize, err, status = s.doWriteAppendEx(req.Data, req.FileOffset, req.Size, direct, true); status == LastEKVersionNotEqual {
log.LogDebugf("action[streamer.write] tryDirectAppendWrite req %v FileOffset %v size %v", req.ExtentKey, req.FileOffset, req.Size)
if writeSize, _, err, status = s.tryDirectAppendWrite(req, direct); status == int32(proto.OpTryOtherExtent) {
log.LogDebugf("action[streamer.write] doWriteAppend again req %v FileOffset %v size %v", req.ExtentKey, req.FileOffset, req.Size)
writeSize, err, _ = s.doWriteAppendEx(req.Data, req.FileOffset, req.Size, direct, false)
}
}
log.LogDebugf("action[streamer.write] doWriteAppend status %v err %v", status, err)
return
}
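// doWriteAppendEx performs the actual append write. For hot volumes it reuses the current
// extent handler (or the last extent key when reUseEk is true) and retries with a fresh
// handler up to MaxNewHandlerRetry times; for cold volumes a new handler is created and
// closed for every write. The resulting extent key is cached locally for later requests.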
func (s *Streamer) doWriteAppendEx(data []byte, offset, size int, direct bool, reUseEk bool) (total int, err error, status int32) {
var (
ek *proto.ExtentKey
storeMode int
)
// Small files are usually written in a single write, so use tiny extent
// store only for the first write operation.
storeMode = s.GetStoreMod(offset, size)
log.LogDebugf("doWriteAppendEx enter: ino(%v) offset(%v) size(%v) storeMode(%v)", s.inode, offset, size, storeMode)
if proto.IsHot(s.client.volumeType) {
if reUseEk {
if isLastEkVerNotEqual := s.tryInitExtentHandlerByLastEk(offset, size); isLastEkVerNotEqual {
log.LogDebugf("doWriteAppendEx enter: ino(%v) tryInitExtentHandlerByLastEk worked but seq not equal", s.inode)
status = LastEKVersionNotEqual
return
}
} else if s.handler != nil {
s.closeOpenHandler()
}
for i := 0; i < MaxNewHandlerRetry; i++ {
if s.handler == nil {
s.handler = NewExtentHandler(s, offset, storeMode, 0)
s.dirty = false
} else if s.handler.storeMode != storeMode {
// store mode changed, so close open handler and start a new one
s.closeOpenHandler()
continue
}
ek, err = s.handler.write(data, offset, size, direct)
if err == nil && ek != nil {
ek.SetSeq(s.verSeq)
if !s.dirty {
s.dirtylist.Put(s.handler)
s.dirty = true
}
break
}
s.closeOpenHandler()
}
} else {
s.handler = NewExtentHandler(s, offset, storeMode, 0)
s.dirty = false
ek, err = s.handler.write(data, offset, size, direct)
if err == nil && ek != nil {
if !s.dirty {
s.dirtylist.Put(s.handler)
s.dirty = true
}
}
err = s.closeOpenHandler()
}
if err != nil || ek == nil {
log.LogErrorf("doWriteAppendEx error: ino(%v) offset(%v) size(%v) err(%v) ek(%v)", s.inode, offset, size, err, ek)
return
}
// This ek is just a local cache for PrepareWriteRequest, so ignore discard eks here.
_ = s.extents.Append(ek, false)
total = size
return
}
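// flush flushes every extent handler on the dirty list and removes it from the list;
// handlers that are no longer open are cleaned up afterwards.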
func (s *Streamer) flush() (err error) {
for {
element := s.dirtylist.Get()
if element == nil {
break
}
eh := element.Value.(*ExtentHandler)
log.LogDebugf("Streamer flush begin: eh(%v)", eh)
err = eh.flush()
if err != nil {
log.LogErrorf("Streamer flush failed: eh(%v)", eh)
return
}
eh.stream.dirtylist.Remove(element)
if eh.getStatus() == ExtentStatusOpen {
s.dirty = false
log.LogDebugf("Streamer flush handler open: eh(%v)", eh)
} else {
// TODO unhandled error
eh.cleanup()
log.LogDebugf("Streamer flush handler cleaned up: eh(%v)", eh)
}
log.LogDebugf("Streamer flush end: eh(%v)", eh)
}
return
}
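// traverse walks the dirty list in the background: closed handlers without inflight
// packets get their extent keys committed to the meta partition and are cleaned up,
// while still-open handlers are flushed once enough traversal rounds have passed.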
func (s *Streamer) traverse() (err error) {
s.traversed++
length := s.dirtylist.Len()
for i := 0; i < length; i++ {
element := s.dirtylist.Get()
if element == nil {
break
}
eh := element.Value.(*ExtentHandler)
log.LogDebugf("Streamer traverse begin: eh(%v)", eh)
if eh.getStatus() >= ExtentStatusClosed {
// the handler can be in a closed, recovery, or error state, so there may still be
// packets that have not been flushed yet.
eh.flushPacket()
if atomic.LoadInt32(&eh.inflight) > 0 {
log.LogDebugf("Streamer traverse skipped: non-zero inflight, eh(%v)", eh)
continue
}
err = eh.appendExtentKey()
if err != nil {
log.LogWarnf("Streamer traverse abort: appendExtentKey failed, eh(%v) err(%v)", eh, err)
// set the streamer to error status to avoid further writes
if err == syscall.EIO {
atomic.StoreInt32(&eh.stream.status, StreamerError)
}
return
}
s.dirtylist.Remove(element)
eh.cleanup()
} else {
if s.traversed < streamWriterFlushPeriod {
log.LogDebugf("Streamer traverse skipped: traversed(%v) eh(%v)", s.traversed, eh)
continue
}
if err = eh.flush(); err != nil {
log.LogWarnf("Streamer traverse flush: eh(%v) err(%v)", eh, err)
}
}
log.LogDebugf("Streamer traverse end: eh(%v)", eh)
}
return
}
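// closeOpenHandler closes the currently open extent handler and any recover handlers
// chained to it, flushing or forwarding their pending packets, and detaches the handler
// from the streamer. A handler that is not on the dirty list is cleaned up immediately.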
func (s *Streamer) closeOpenHandler() (err error) {
// just in case to avoid infinite loop
var cnt int = 2 * MaxPacketErrorCount
handler := s.handler
for handler != nil && cnt >= 0 {
handler.setClosed()
if s.dirtylist.Len() < MaxDirtyListLen {
handler.flushPacket()
} else {
// TODO unhandled error
err = s.handler.flush()
}
handler = handler.recoverHandler
cnt--
}
if s.handler != nil {
if !s.dirty {
// in case the current handler is not on the dirty list and will not get cleaned up
// TODO unhandled error
log.LogDebugf("action[Streamer.closeOpenHandler]")
s.handler.cleanup()
}
s.handler = nil
}
return err
}
func (s *Streamer) open() {
s.refcnt++
log.LogDebugf("open: streamer(%v) refcnt(%v)", s, s.refcnt)
}
func (s *Streamer) release() error {
s.refcnt--
s.closeOpenHandler()
err := s.flush()
if err != nil {
s.abort()
}
log.LogDebugf("release: streamer(%v) refcnt(%v)", s, s.refcnt)
return err
}
func (s *Streamer) evict() error {
s.client.streamerLock.Lock()
if s.refcnt > 0 || len(s.request) != 0 {
s.client.streamerLock.Unlock()
return errors.New(fmt.Sprintf("evict: streamer(%v) refcnt(%v)", s, s.refcnt))
}
if s.client.disableMetaCache || !s.needBCache {
delete(s.client.streamers, s.inode)
}
s.client.streamerLock.Unlock()
return nil
}
func (s *Streamer) abort() {
for {
element := s.dirtylist.Get()
if element == nil {
break
}
eh := element.Value.(*ExtentHandler)
s.dirtylist.Remove(element)
// TODO unhandled error
eh.cleanup()
}
}
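// truncate flushes outstanding writes, truncates the inode on the meta partition, and
// then adjusts the local extent cache: growing only updates the cached size, while
// shrinking discards truncated extents and re-fetches the extent list.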
func (s *Streamer) truncate(size int, fullPath string) error {
s.closeOpenHandler()
err := s.flush()
if err != nil {
return err
}
err = s.client.truncate(s.inode, uint64(size), fullPath)
if err != nil {
return err
}
oldsize, _ := s.extents.Size()
if oldsize <= size {
s.extents.SetSize(uint64(size), true)
return nil
}
s.extents.TruncDiscard(uint64(size))
return s.GetExtentsForce()
}
func (s *Streamer) updateVer(verSeq uint64) (err error) {
log.LogInfof("action[stream.updateVer] ver %v update to %v", s.verSeq, verSeq)
if s.verSeq != verSeq {
log.LogInfof("action[stream.updateVer] ver %v update to %v", s.verSeq, verSeq)
s.verSeq = verSeq
s.extents.verSeq = verSeq
}
return
}
func (s *Streamer) tinySizeLimit() int {
return util.DefaultTinySizeLimit
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"fmt"
"net"
"strings"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
// DataPartition defines the wrapper of the data partition.
type DataPartition struct {
// Will not be changed
proto.DataPartitionResponse
RandomWrite bool
NearHosts []string
ClientWrapper *Wrapper
Metrics *DataPartitionMetrics
}
// DataPartitionMetrics defines the wrapper of the metrics related to the data partition.
type DataPartitionMetrics struct {
sync.RWMutex
AvgReadLatencyNano int64
AvgWriteLatencyNano int64
SumReadLatencyNano int64
SumWriteLatencyNano int64
ReadOpNum int64
WriteOpNum int64
}
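// RecordWrite accumulates the latency of a single write operation into the partition
// metrics; startT is the operation start time in nanoseconds.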
func (dp *DataPartition) RecordWrite(startT int64) {
if startT == 0 {
log.LogWarnf("RecordWrite: invalid start time")
return
}
cost := time.Now().UnixNano() - startT
dp.Metrics.Lock()
defer dp.Metrics.Unlock()
dp.Metrics.WriteOpNum++
dp.Metrics.SumWriteLatencyNano += cost
return
}
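// MetricsRefresh recomputes the average read/write latencies from the accumulated sums
// and resets the counters for the next sampling period.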
func (dp *DataPartition) MetricsRefresh() {
dp.Metrics.Lock()
defer dp.Metrics.Unlock()
if dp.Metrics.ReadOpNum != 0 {
dp.Metrics.AvgReadLatencyNano = dp.Metrics.SumReadLatencyNano / dp.Metrics.ReadOpNum
} else {
dp.Metrics.AvgReadLatencyNano = 0
}
if dp.Metrics.WriteOpNum != 0 {
dp.Metrics.AvgWriteLatencyNano = dp.Metrics.SumWriteLatencyNano / dp.Metrics.WriteOpNum
} else {
dp.Metrics.AvgWriteLatencyNano = 0
}
dp.Metrics.SumReadLatencyNano = 0
dp.Metrics.SumWriteLatencyNano = 0
dp.Metrics.ReadOpNum = 0
dp.Metrics.WriteOpNum = 0
}
func (dp *DataPartition) GetAvgRead() int64 {
dp.Metrics.RLock()
defer dp.Metrics.RUnlock()
return dp.Metrics.AvgReadLatencyNano
}
func (dp *DataPartition) GetAvgWrite() int64 {
dp.Metrics.RLock()
defer dp.Metrics.RUnlock()
return dp.Metrics.AvgWriteLatencyNano
}
type DataPartitionSorter []*DataPartition
func (ds DataPartitionSorter) Len() int {
return len(ds)
}
func (ds DataPartitionSorter) Swap(i, j int) {
ds[i], ds[j] = ds[j], ds[i]
}
func (ds DataPartitionSorter) Less(i, j int) bool {
return ds[i].Metrics.AvgWriteLatencyNano < ds[j].Metrics.AvgWriteLatencyNano
}
// NewDataPartitionMetrics returns a new DataPartitionMetrics instance.
func NewDataPartitionMetrics() *DataPartitionMetrics {
metrics := new(DataPartitionMetrics)
return metrics
}
// String returns the string format of the data partition.
func (dp *DataPartition) String() string {
return fmt.Sprintf("PartitionID(%v) Type(%v), Status(%v) ReplicaNum(%v) Hosts(%v) NearHosts(%v)",
dp.PartitionID, dp.PartitionType, dp.Status, dp.ReplicaNum, dp.Hosts, dp.NearHosts)
}
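// CheckAllHostsIsAvail dials every replica host of the partition and records the hosts
// that refuse the connection into the exclude map.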
func (dp *DataPartition) CheckAllHostsIsAvail(exclude map[string]struct{}) {
var (
conn net.Conn
err error
)
for i := 0; i < len(dp.Hosts); i++ {
host := dp.Hosts[i]
if conn, err = util.DailTimeOut(host, proto.ReadDeadlineTime*time.Second); err != nil {
log.LogWarnf("CheckAllHostsIsAvail: dial host (%v) err(%v)", host, err)
if strings.Contains(err.Error(), syscall.ECONNREFUSED.Error()) {
exclude[host] = struct{}{}
}
continue
}
conn.Close()
}
}
// GetAllAddrs returns the addresses of all the replicas of the data partition.
func (dp *DataPartition) GetAllAddrs() string {
return strings.Join(dp.Hosts[1:], proto.AddrSplit) + proto.AddrSplit
}
func isExcluded(dp *DataPartition, exclude map[string]struct{}) bool {
for _, host := range dp.Hosts {
if _, exist := exclude[host]; exist {
return true
}
}
return false
}
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"errors"
"strings"
"github.com/cubefs/cubefs/util/log"
)
// This type defines the constructor used to create and initialize the selector.
type DataPartitionSelectorConstructor = func(param string) (DataPartitionSelector, error)
// DataPartitionSelector is the interface that defines the methods necessary to
// implement a data partition selector.
type DataPartitionSelector interface {
// Name returns the name of the current selector instance.
Name() string
// Refresh refreshes the selector with the specified data partitions.
Refresh(partitions []*DataPartition) error
// Select returns a data partition picked by the selector.
Select(excludes map[string]struct{}) (*DataPartition, error)
// RemoveDP removes the specified data partition.
RemoveDP(partitionID uint64)
// Count returns the number of data partitions held by the selector.
Count() int
}
var (
dataPartitionSelectorConstructors = make(map[string]DataPartitionSelectorConstructor)
ErrDuplicatedDataPartitionSelectorConstructor = errors.New("duplicated data partition selector constructor")
ErrDataPartitionSelectorConstructorNotExist = errors.New("data partition selector constructor not exist")
)
// RegisterDataPartitionSelector registers a selector constructor.
// Users can register their own defined selector through this method.
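// For example (illustrative sketch; mySelector is a hypothetical type that
// implements DataPartitionSelector):
//
//	func init() {
//		_ = RegisterDataPartitionSelector("myselector", func(param string) (DataPartitionSelector, error) {
//			return &mySelector{}, nil
//		})
//	}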
func RegisterDataPartitionSelector(name string, constructor DataPartitionSelectorConstructor) error {
clearName := strings.TrimSpace(strings.ToLower(name))
if _, exist := dataPartitionSelectorConstructors[clearName]; exist {
return ErrDuplicatedDataPartitionSelectorConstructor
}
dataPartitionSelectorConstructors[clearName] = constructor
return nil
}
func newDataPartitionSelector(name string, param string) (newDpSelector DataPartitionSelector, err error) {
clearName := strings.TrimSpace(strings.ToLower(name))
constructor, exist := dataPartitionSelectorConstructors[clearName]
if !exist {
return nil, ErrDataPartitionSelectorConstructorNotExist
}
return constructor(param)
}
func (w *Wrapper) initDpSelector() (err error) {
w.dpSelectorChanged = false
selectorName := w.dpSelectorName
if strings.TrimSpace(selectorName) == "" {
log.LogInfof("initDpSelector: can not find dp selector[%v], use default selector", w.dpSelectorName)
selectorName = DefaultRandomSelectorName
}
var selector DataPartitionSelector
if selector, err = newDataPartitionSelector(selectorName, w.dpSelectorParm); err != nil {
log.LogErrorf("initDpSelector: dpSelector[%v] init failed caused by [%v], use default selector", w.dpSelectorName,
err)
return
}
w.dpSelector = selector
return
}
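// refreshDpSelector rebuilds the selector when its name or parameter has changed on the
// master, then refreshes it with the latest read-write partitions.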
func (w *Wrapper) refreshDpSelector(partitions []*DataPartition) {
w.Lock.RLock()
dpSelector := w.dpSelector
dpSelectorChanged := w.dpSelectorChanged
w.Lock.RUnlock()
if dpSelectorChanged {
selectorName := w.dpSelectorName
if strings.TrimSpace(selectorName) == "" {
log.LogWarnf("refreshDpSelector: can not find dp selector[%v], use default selector", w.dpSelectorName)
selectorName = DefaultRandomSelectorName
}
newDpSelector, err := newDataPartitionSelector(selectorName, w.dpSelectorParm)
if err != nil {
log.LogErrorf("refreshDpSelector: change dpSelector to [%v %v] failed caused by [%v],"+
" use last valid selector. Please change dpSelector config through master.",
w.dpSelectorName, w.dpSelectorParm, err)
} else {
w.Lock.Lock()
log.LogInfof("refreshDpSelector: change dpSelector to [%v %v]", w.dpSelectorName, w.dpSelectorParm)
w.dpSelector = newDpSelector
w.dpSelectorChanged = false
dpSelector = newDpSelector
w.Lock.Unlock()
}
}
_ = dpSelector.Refresh(partitions)
}
// getDataPartitionForWrite returns an available data partition for write.
func (w *Wrapper) GetDataPartitionForWrite(exclude map[string]struct{}) (*DataPartition, error) {
w.Lock.RLock()
dpSelector := w.dpSelector
w.Lock.RUnlock()
return dpSelector.Select(exclude)
}
func (w *Wrapper) RemoveDataPartitionForWrite(partitionID uint64) {
w.Lock.RLock()
dpSelector := w.dpSelector
w.Lock.RUnlock()
if dpSelector.Count() <= 1 {
return
}
dpSelector.RemoveDP(partitionID)
}
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"fmt"
"math/rand"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
DefaultRandomSelectorName = "default"
)
func init() {
_ = RegisterDataPartitionSelector(DefaultRandomSelectorName, newDefaultRandomSelector)
}
func newDefaultRandomSelector(_ string) (selector DataPartitionSelector, e error) {
selector = &DefaultRandomSelector{
localLeaderPartitions: make([]*DataPartition, 0),
partitions: make([]*DataPartition, 0),
}
return
}
type DefaultRandomSelector struct {
sync.RWMutex
localLeaderPartitions []*DataPartition
partitions []*DataPartition
}
func (s *DefaultRandomSelector) Name() string {
return DefaultRandomSelectorName
}
func (s *DefaultRandomSelector) Refresh(partitions []*DataPartition) (err error) {
var localLeaderPartitions []*DataPartition
for i := 0; i < len(partitions); i++ {
if strings.Split(partitions[i].Hosts[0], ":")[0] == LocalIP {
localLeaderPartitions = append(localLeaderPartitions, partitions[i])
}
}
s.Lock()
defer s.Unlock()
s.localLeaderPartitions = localLeaderPartitions
s.partitions = partitions
return
}
func (s *DefaultRandomSelector) Select(exclude map[string]struct{}) (dp *DataPartition, err error) {
dp = s.getLocalLeaderDataPartition(exclude)
if dp != nil {
return dp, nil
}
s.RLock()
partitions := s.partitions
s.RUnlock()
dp = s.getRandomDataPartition(partitions, exclude)
if dp != nil {
return dp, nil
}
log.LogErrorf("DefaultRandomSelector: no writable data partition with %v partitions and exclude(%v)",
len(partitions), exclude)
return nil, fmt.Errorf("no writable data partition")
}
func (s *DefaultRandomSelector) RemoveDP(partitionID uint64) {
s.RLock()
rwPartitionGroups := s.partitions
localLeaderPartitions := s.localLeaderPartitions
s.RUnlock()
var i int
for i = 0; i < len(rwPartitionGroups); i++ {
if rwPartitionGroups[i].PartitionID == partitionID {
break
}
}
if i >= len(rwPartitionGroups) {
return
}
newRwPartition := make([]*DataPartition, 0)
newRwPartition = append(newRwPartition, rwPartitionGroups[:i]...)
newRwPartition = append(newRwPartition, rwPartitionGroups[i+1:]...)
defer func() {
s.Lock()
s.partitions = newRwPartition
s.Unlock()
}()
for i = 0; i < len(localLeaderPartitions); i++ {
if localLeaderPartitions[i].PartitionID == partitionID {
break
}
}
if i >= len(localLeaderPartitions) {
return
}
newLocalLeaderPartitions := make([]*DataPartition, 0)
newLocalLeaderPartitions = append(newLocalLeaderPartitions, localLeaderPartitions[:i]...)
newLocalLeaderPartitions = append(newLocalLeaderPartitions, localLeaderPartitions[i+1:]...)
s.Lock()
defer s.Unlock()
s.localLeaderPartitions = newLocalLeaderPartitions
return
}
func (s *DefaultRandomSelector) Count() int {
s.RLock()
defer s.RUnlock()
return len(s.partitions)
}
func (s *DefaultRandomSelector) getLocalLeaderDataPartition(exclude map[string]struct{}) *DataPartition {
s.RLock()
localLeaderPartitions := s.localLeaderPartitions
s.RUnlock()
return s.getRandomDataPartition(localLeaderPartitions, exclude)
}
func (s *DefaultRandomSelector) getRandomDataPartition(partitions []*DataPartition, exclude map[string]struct{}) (
dp *DataPartition) {
length := len(partitions)
if length == 0 {
return nil
}
rand.Seed(time.Now().UnixNano())
index := rand.Intn(length)
dp = partitions[index]
if !isExcluded(dp, exclude) {
log.LogDebugf("DefaultRandomSelector: select dp[%v] address[%p], index %v", dp, dp, index)
return dp
}
log.LogWarnf("DefaultRandomSelector: first random partition was excluded, get partition from others")
var currIndex int
for i := 0; i < length; i++ {
currIndex = (index + i) % length
if !isExcluded(partitions[currIndex], exclude) {
log.LogDebugf("DefaultRandomSelector: select dp[%v], index %v", partitions[currIndex], currIndex)
return partitions[currIndex]
}
}
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"fmt"
"math/rand"
"strconv"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
KFasterRandomSelectorName = "kfaster"
)
func init() {
_ = RegisterDataPartitionSelector(KFasterRandomSelectorName, newKFasterRandomSelector)
}
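// newKFasterRandomSelector builds a selector that prefers the k fastest partitions.
// selectorParam is the percentage (1-99) of partitions, ranked by average write latency,
// that are treated as the "faster" candidate set.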
func newKFasterRandomSelector(selectorParam string) (selector DataPartitionSelector, e error) {
param, err := strconv.Atoi(selectorParam)
if err != nil {
return nil, fmt.Errorf("KFasterRandomSelector: get param failed[%v]", err)
}
if (param <= 0) || (param >= 100) {
return nil, fmt.Errorf("KFasterRandomSelector: invalid param[%v]", param)
}
selector = &KFasterRandomSelector{
kValueHundred: param,
partitions: make([]*DataPartition, 0),
}
log.LogInfof("KFasterRandomSelector: init selector success, kValueHundred is %v", param)
return
}
type KFasterRandomSelector struct {
sync.RWMutex
kValueHundred int
kValue int
partitions []*DataPartition
}
func (s *KFasterRandomSelector) Name() string {
return KFasterRandomSelectorName
}
func (s *KFasterRandomSelector) Refresh(partitions []*DataPartition) (err error) {
kValue := (len(partitions)-1)*s.kValueHundred/100 + 1
selectKminDataPartition(partitions, kValue)
s.Lock()
defer s.Unlock()
s.kValue = kValue
s.partitions = partitions
return
}
func (s *KFasterRandomSelector) Select(exclude map[string]struct{}) (dp *DataPartition, err error) {
s.RLock()
partitions := s.partitions
kValue := s.kValue
s.RUnlock()
if len(partitions) == 0 {
log.LogError("KFasterRandomSelector: no writable data partition with empty partitions")
return nil, fmt.Errorf("no writable data partition")
}
// select random dataPartition from fasterRwPartitions
rand.Seed(time.Now().UnixNano())
index := rand.Intn(kValue)
dp = partitions[index]
if !isExcluded(dp, exclude) {
log.LogDebugf("KFasterRandomSelector: select faster dp[%v], index %v, kValue(%v/%v)",
dp, index, kValue, len(partitions))
return dp, nil
}
log.LogWarnf("KFasterRandomSelector: first random fasterRwPartition was excluded, get partition from other faster")
// if partitions[index] is excluded, select next in fasterRwPartitions
for i := 1; i < kValue; i++ {
dp = partitions[(index+i)%kValue]
if !isExcluded(dp, exclude) {
log.LogDebugf("KFasterRandomSelector: select faster dp[%v], index %v, kValue(%v/%v)",
dp, (index+i)%kValue, kValue, len(partitions))
return dp, nil
}
}
log.LogWarnf("KFasterRandomSelector: all fasterRwPartitions were excluded, get partition from slower")
// if all fasterRwPartitions are excluded, select random dataPartition in slowerRwPartitions
slowerRwPartitionsNum := len(partitions) - kValue
for i := 0; i < slowerRwPartitionsNum; i++ {
dp = partitions[(index+i)%slowerRwPartitionsNum+kValue]
if !isExcluded(dp, exclude) {
log.LogDebugf("KFasterRandomSelector: select slower dp[%v], index %v, kValue(%v/%v)",
dp, (index+i)%slowerRwPartitionsNum+kValue, kValue, len(partitions))
return dp, nil
}
}
log.LogErrorf("KFasterRandomSelector: no writable data partition with %v partitions and exclude(%v)",
len(partitions), exclude)
return nil, fmt.Errorf("no writable data partition")
}
func (s *KFasterRandomSelector) RemoveDP(partitionID uint64) {
s.RLock()
partitions := s.partitions
s.RUnlock()
var i int
for i = 0; i < len(partitions); i++ {
if partitions[i].PartitionID == partitionID {
break
}
}
if i >= len(partitions) {
return
}
newRwPartition := make([]*DataPartition, 0)
newRwPartition = append(newRwPartition, partitions[:i]...)
newRwPartition = append(newRwPartition, partitions[i+1:]...)
s.Refresh(newRwPartition)
return
}
func (s *KFasterRandomSelector) Count() int {
s.RLock()
defer s.RUnlock()
return len(s.partitions)
}
func swap(s []*DataPartition, i int, j int) {
s[i], s[j] = s[j], s[i]
}
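// partByPrivot partitions partitions[low..high] around the element at low, ordered by
// average write latency, and returns the pivot's final index.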
func partByPrivot(partitions []*DataPartition, low, high int) int {
var i, j int
for {
for i = low + 1; i < high; i++ {
if partitions[i].GetAvgWrite() > partitions[low].GetAvgWrite() {
break
}
}
for j = high; j > low; j-- {
if partitions[j].GetAvgWrite() <= partitions[low].GetAvgWrite() {
break
}
}
if i >= j {
break
}
swap(partitions, i, j)
}
if low != j {
swap(partitions, low, j)
}
return j
}
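// selectKminDataPartition partially sorts partitions with a quickselect so that the k
// partitions with the lowest average write latency end up at the front of the slice.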
func selectKminDataPartition(partitions []*DataPartition, k int) int {
if len(partitions) <= 1 {
return k
}
low, high := 0, len(partitions)-1
for {
privot := partByPrivot(partitions, low, high)
if privot < k {
low = privot + 1
} else if privot > k {
high = privot - 1
} else {
return k
}
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"fmt"
syslog "log"
"math"
"net"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/iputil"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/ump"
)
var (
LocalIP string
DefaultMinWriteAbleDataPartitionCnt = 10
)
type DataPartitionView struct {
DataPartitions []*DataPartition
}
type SimpleClientInfo interface {
GetFlowInfo() (*proto.ClientReportLimitInfo, bool)
UpdateFlowInfo(limit *proto.LimitRsp2Client)
SetClientID(id uint64) error
UpdateLatestVer(verList *proto.VolVersionInfoList) error
GetReadVer() uint64
GetLatestVer() uint64
GetVerMgr() *proto.VolVersionInfoList
}
// Wrapper TODO rename. This name does not reflect what it is doing.
type Wrapper struct {
Lock sync.RWMutex
clusterName string
volName string
volType int
EnablePosixAcl bool
masters []string
partitions map[uint64]*DataPartition
followerRead bool
followerReadClientCfg bool
nearRead bool
dpSelectorChanged bool
dpSelectorName string
dpSelectorParm string
mc *masterSDK.MasterClient
stopOnce sync.Once
stopC chan struct{}
dpSelector DataPartitionSelector
HostsStatus map[string]bool
Uids map[uint32]*proto.UidSimpleInfo
UidLock sync.RWMutex
preload bool
LocalIp string
minWriteAbleDataPartitionCnt int
verConfReadSeq uint64
verReadSeq uint64
SimpleClient SimpleClientInfo
}
func (w *Wrapper) GetMasterClient() *masterSDK.MasterClient {
return w.mc
}
// NewDataPartitionWrapper returns a new data partition wrapper.
func NewDataPartitionWrapper(client SimpleClientInfo, volName string, masters []string, preload bool, minWriteAbleDataPartitionCnt int, verReadSeq uint64) (w *Wrapper, err error) {
log.LogInfof("action[NewDataPartitionWrapper] verReadSeq %v", verReadSeq)
w = new(Wrapper)
w.stopC = make(chan struct{})
w.masters = masters
w.mc = masterSDK.NewMasterClient(masters, false)
w.volName = volName
w.partitions = make(map[uint64]*DataPartition)
w.HostsStatus = make(map[string]bool)
w.preload = preload
w.minWriteAbleDataPartitionCnt = minWriteAbleDataPartitionCnt
if w.minWriteAbleDataPartitionCnt < 0 {
w.minWriteAbleDataPartitionCnt = DefaultMinWriteAbleDataPartitionCnt
}
if w.LocalIp, err = ump.GetLocalIpAddr(); err != nil {
err = errors.Trace(err, "NewDataPartitionWrapper:")
return
}
if err = w.updateClusterInfo(); err != nil {
err = errors.Trace(err, "NewDataPartitionWrapper:")
return
}
if err = w.GetSimpleVolView(); err != nil {
err = errors.Trace(err, "NewDataPartitionWrapper:")
return
}
w.UploadFlowInfo(client, true)
if err = w.initDpSelector(); err != nil {
log.LogErrorf("NewDataPartitionWrapper: init initDpSelector failed, [%v]", err)
}
if err = w.updateDataPartition(true); err != nil {
err = errors.Trace(err, "NewDataPartitionWrapper:")
return
}
if err = w.updateDataNodeStatus(); err != nil {
log.LogErrorf("NewDataPartitionWrapper: init DataNodeStatus failed, [%v]", err)
}
w.verConfReadSeq = verReadSeq
if verReadSeq > 0 {
var verList *proto.VolVersionInfoList
if verList, err = w.mc.AdminAPI().GetVerList(volName); err != nil {
return
}
if verReadSeq, err = w.CheckReadVerSeq(volName, verReadSeq, verList); err != nil {
log.LogErrorf("NewDataPartitionWrapper: init Read with ver [%v] error [%v]", verReadSeq, err)
return
}
}
w.verReadSeq = verReadSeq
w.SimpleClient = client
go w.uploadFlowInfoByTick(client)
go w.update(client)
return
}
func (w *Wrapper) Stop() {
w.stopOnce.Do(func() {
close(w.stopC)
})
}
func (w *Wrapper) InitFollowerRead(clientConfig bool) {
w.followerReadClientCfg = clientConfig
w.followerRead = w.followerReadClientCfg || w.followerRead
}
func (w *Wrapper) FollowerRead() bool {
return w.followerRead
}
func (w *Wrapper) tryGetPartition(index uint64) (partition *DataPartition, ok bool) {
w.Lock.RLock()
defer w.Lock.RUnlock()
partition, ok = w.partitions[index]
return
}
func (w *Wrapper) updateClusterInfo() (err error) {
var info *proto.ClusterInfo
if info, err = w.mc.AdminAPI().GetClusterInfo(); err != nil {
log.LogWarnf("UpdateClusterInfo: get cluster info fail: err(%v)", err)
return
}
log.LogInfof("UpdateClusterInfo: get cluster info: cluster(%v) localIP(%v)", info.Cluster, info.Ip)
w.clusterName = info.Cluster
LocalIP = info.Ip
return
}
func (w *Wrapper) UpdateUidsView(view *proto.SimpleVolView) {
w.UidLock.Lock()
defer w.UidLock.Unlock()
w.Uids = make(map[uint32]*proto.UidSimpleInfo)
for _, uid := range view.Uids {
if !uid.Limited {
continue
}
w.Uids[uid.UID] = &uid
}
log.LogDebugf("uid info be updated to %v", view.Uids)
}
func (w *Wrapper) GetSimpleVolView() (err error) {
var view *proto.SimpleVolView
if view, err = w.mc.AdminAPI().GetVolumeSimpleInfo(w.volName); err != nil {
log.LogWarnf("GetSimpleVolView: get volume simple info fail: volume(%v) err(%v)", w.volName, err)
return
}
if view.Status == 1 {
log.LogWarnf("GetSimpleVolView: volume has been marked for deletion: volume(%v) status(%v - 0:normal/1:markDelete)",
w.volName, view.Status)
return proto.ErrVolNotExists
}
w.followerRead = view.FollowerRead
w.dpSelectorName = view.DpSelectorName
w.dpSelectorParm = view.DpSelectorParm
w.volType = view.VolType
w.EnablePosixAcl = view.EnablePosixAcl
w.UpdateUidsView(view)
log.LogDebugf("GetSimpleVolView: get volume simple info: ID(%v) name(%v) owner(%v) status(%v) capacity(%v) "+
"metaReplicas(%v) dataReplicas(%v) mpCnt(%v) dpCnt(%v) followerRead(%v) createTime(%v) dpSelectorName(%v) "+
"dpSelectorParm(%v) uids(%v)",
view.ID, view.Name, view.Owner, view.Status, view.Capacity, view.MpReplicaNum, view.DpReplicaNum, view.MpCnt,
view.DpCnt, view.FollowerRead, view.CreateTime, view.DpSelectorName, view.DpSelectorParm, view.Uids)
return
}
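// uploadFlowInfoByTick reports client flow statistics to the master every 5 seconds
// until the wrapper is stopped.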
func (w *Wrapper) uploadFlowInfoByTick(clientInfo SimpleClientInfo) {
ticker := time.NewTicker(5 * time.Second)
for {
select {
case <-ticker.C:
w.UploadFlowInfo(clientInfo, false)
case <-w.stopC:
return
}
}
}
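// update refreshes the volume view, data partitions, data node status, permissions and
// version list once per minute until the wrapper is stopped.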
func (w *Wrapper) update(clientInfo SimpleClientInfo) {
ticker := time.NewTicker(time.Minute)
taskFunc := func() {
w.updateSimpleVolView()
w.updateDataPartition(false)
w.updateDataNodeStatus()
w.CheckPermission()
w.updateVerlist(clientInfo)
}
taskFunc()
for {
select {
case <-ticker.C:
taskFunc()
case <-w.stopC:
return
}
}
}
func (w *Wrapper) UploadFlowInfo(clientInfo SimpleClientInfo, init bool) (err error) {
var limitRsp *proto.LimitRsp2Client
flowInfo, isNeedReport := clientInfo.GetFlowInfo()
if !isNeedReport {
log.LogDebugf("action[UploadFlowInfo] no need report!")
return nil
}
if limitRsp, err = w.mc.AdminAPI().UploadFlowInfo(w.volName, flowInfo); err != nil {
log.LogWarnf("UpdateSimpleVolView: get volume simple info fail: volume(%v) err(%v)", w.volName, err)
return
}
if init {
if limitRsp.ID == 0 {
err = fmt.Errorf("init client get id 0")
log.LogInfof("action[UploadFlowInfo] err %v", err.Error())
return
}
log.LogInfof("action[UploadFlowInfo] get id %v", limitRsp.ID)
clientInfo.SetClientID(limitRsp.ID)
}
clientInfo.UpdateFlowInfo(limitRsp)
return
}
func (w *Wrapper) CheckPermission() {
if info, err := w.mc.UserAPI().AclOperation(w.volName, w.LocalIp, util.AclCheckIP); err != nil {
syslog.Println(err)
} else if !info.OK {
syslog.Println("Client Addr not allowed to access CubeFS Cluster!")
log.LogFatal("Client Addr not allowed to access CubeFS Cluster!")
}
}
func (w *Wrapper) updateVerlist(client SimpleClientInfo) (err error) {
verList, err := w.mc.AdminAPI().GetVerList(w.volName)
if err != nil {
log.LogErrorf("CheckReadVerSeq: get cluster fail: err(%v)", err)
return err
}
if verList == nil {
msg := fmt.Sprintf("get verList nil, vol [%v] reqd seq [%v]", w.volName, w.verReadSeq)
log.LogErrorf("action[CheckReadVerSeq] %v", msg)
return fmt.Errorf("%v", msg)
}
if w.verReadSeq > 0 {
if _, err = w.CheckReadVerSeq(w.volName, w.verConfReadSeq, verList); err != nil {
log.LogFatalf("updateSimpleVolView: readSeq abnormal %v", err)
}
return
}
log.LogDebugf("updateSimpleVolView.UpdateLatestVer.try update to verlist[%v]", verList)
if err = client.UpdateLatestVer(verList); err != nil {
log.LogWarnf("updateSimpleVolView: UpdateLatestVer ver %v faile err %v", verList.GetLastVer(), err)
return
}
return
}
func (w *Wrapper) updateSimpleVolView() (err error) {
var view *proto.SimpleVolView
if view, err = w.mc.AdminAPI().GetVolumeSimpleInfo(w.volName); err != nil {
log.LogWarnf("updateSimpleVolView: get volume simple info fail: volume(%v) err(%v)", w.volName, err)
return
}
w.UpdateUidsView(view)
if w.followerRead != view.FollowerRead && !w.followerReadClientCfg {
log.LogDebugf("UpdateSimpleVolView: update followerRead from old(%v) to new(%v)",
w.followerRead, view.FollowerRead)
w.followerRead = view.FollowerRead
}
if w.dpSelectorName != view.DpSelectorName || w.dpSelectorParm != view.DpSelectorParm {
log.LogDebugf("UpdateSimpleVolView: update dpSelector from old(%v %v) to new(%v %v)",
w.dpSelectorName, w.dpSelectorParm, view.DpSelectorName, view.DpSelectorParm)
w.Lock.Lock()
w.dpSelectorName = view.DpSelectorName
w.dpSelectorParm = view.DpSelectorParm
w.dpSelectorChanged = true
w.Lock.Unlock()
}
return nil
}
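// updateDataPartitionByRsp merges the partitions returned by the master into the local
// cache and refreshes the selector with the read-write ones. Preload partitions of cold
// volumes are cached but never handed to the selector, and the selector is only refreshed
// when enough writable partitions are available (or on the initial mount).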
func (w *Wrapper) updateDataPartitionByRsp(isInit bool, DataPartitions []*proto.DataPartitionResponse) (err error) {
convert := func(response *proto.DataPartitionResponse) *DataPartition {
return &DataPartition{
DataPartitionResponse: *response,
ClientWrapper: w,
}
}
if proto.IsCold(w.volType) {
w.clearPartitions()
}
rwPartitionGroups := make([]*DataPartition, 0)
for index, partition := range DataPartitions {
if partition == nil {
log.LogErrorf("action[updateDataPartitionByRsp] index [%v] is nil", index)
continue
}
dp := convert(partition)
if w.followerRead && w.nearRead {
dp.NearHosts = w.sortHostsByDistance(dp.Hosts)
}
log.LogInfof("updateDataPartition: dp(%v)", dp)
w.replaceOrInsertPartition(dp)
// do not insert preload dp in cold vol
if proto.IsCold(w.volType) && proto.IsPreLoadDp(dp.PartitionType) {
continue
}
if dp.Status == proto.ReadWrite {
dp.MetricsRefresh()
rwPartitionGroups = append(rwPartitionGroups, dp)
log.LogInfof("updateDataPartition: dp(%v) address(%p) insert to rwPartitionGroups", dp.PartitionID, dp)
}
}
// isInit indicates whether this call is triggered by the mount action
if isInit || len(rwPartitionGroups) >= w.minWriteAbleDataPartitionCnt || (proto.IsCold(w.volType) && (len(rwPartitionGroups) >= 1)) {
log.LogInfof("updateDataPartition: refresh dpSelector of volume(%v) with %v rw partitions(%v all), isInit(%v), minWriteAbleDataPartitionCnt(%v)",
w.volName, len(rwPartitionGroups), len(DataPartitions), isInit, w.minWriteAbleDataPartitionCnt)
w.refreshDpSelector(rwPartitionGroups)
} else {
err = errors.New("updateDataPartition: no writable data partition")
log.LogWarnf("updateDataPartition: no enough writable data partitions, volume(%v) with %v rw partitions(%v all), isInit(%v), minWriteAbleDataPartitionCnt(%v)",
w.volName, len(rwPartitionGroups), len(DataPartitions), isInit, w.minWriteAbleDataPartitionCnt)
}
log.LogInfof("updateDataPartition: finish")
return err
}
func (w *Wrapper) updateDataPartition(isInit bool) (err error) {
if w.preload {
return
}
var dpv *proto.DataPartitionsView
if dpv, err = w.mc.ClientAPI().EncodingGzip().GetDataPartitions(w.volName); err != nil {
log.LogErrorf("updateDataPartition: get data partitions fail: volume(%v) err(%v)", w.volName, err)
return
}
log.LogInfof("updateDataPartition: get data partitions: volume(%v) partitions(%v)", w.volName, len(dpv.DataPartitions))
return w.updateDataPartitionByRsp(isInit, dpv.DataPartitions)
}
func (w *Wrapper) UpdateDataPartition() (err error) {
return w.updateDataPartition(false)
}
// getDataPartitionFromMaster asks the master for a data partition that is not yet in the
// local cache. The cached view is refreshed by updateDataPartition, which may be stale
// when a proxy such as nginx is placed in front of the master to reduce its load.
func (w *Wrapper) getDataPartitionFromMaster(isInit bool, dpId uint64) (err error) {
var dpInfo *proto.DataPartitionInfo
if dpInfo, err = w.mc.AdminAPI().GetDataPartition(w.volName, dpId); err != nil {
log.LogErrorf("getDataPartitionFromMaster: get data partitions fail: volume(%v) dpId(%v) err(%v)",
w.volName, dpId, err)
return
}
log.LogInfof("getDataPartitionFromMaster: get data partitions: volume(%v), dpId(%v)", w.volName, dpId)
var leaderAddr string
for _, replica := range dpInfo.Replicas {
if replica.IsLeader {
leaderAddr = replica.Addr
}
}
dpr := new(proto.DataPartitionResponse)
dpr.PartitionID = dpId
dpr.Status = dpInfo.Status
dpr.ReplicaNum = dpInfo.ReplicaNum
dpr.Hosts = make([]string, len(dpInfo.Hosts))
copy(dpr.Hosts, dpInfo.Hosts)
dpr.LeaderAddr = leaderAddr
dpr.IsRecover = dpInfo.IsRecover
dpr.IsDiscard = dpInfo.IsDiscard
DataPartitions := make([]*proto.DataPartitionResponse, 0, 1)
DataPartitions = append(DataPartitions, dpr)
return w.updateDataPartitionByRsp(isInit, DataPartitions)
}
func (w *Wrapper) clearPartitions() {
w.Lock.Lock()
defer w.Lock.Unlock()
w.partitions = make(map[uint64]*DataPartition)
}
func (w *Wrapper) AllocatePreLoadDataPartition(volName string, count int, capacity, ttl uint64, zones string) (err error) {
var dpv *proto.DataPartitionsView
if dpv, err = w.mc.AdminAPI().CreatePreLoadDataPartition(volName, count, capacity, ttl, zones); err != nil {
log.LogWarnf("CreatePreLoadDataPartition fail: err(%v)", err)
return
}
convert := func(response *proto.DataPartitionResponse) *DataPartition {
return &DataPartition{
DataPartitionResponse: *response,
ClientWrapper: w,
}
}
rwPartitionGroups := make([]*DataPartition, 0)
for _, partition := range dpv.DataPartitions {
dp := convert(partition)
if proto.IsCold(w.volType) && !proto.IsPreLoadDp(dp.PartitionType) {
continue
}
log.LogInfof("updateDataPartition: dp(%v)", dp)
w.replaceOrInsertPartition(dp)
dp.MetricsRefresh()
rwPartitionGroups = append(rwPartitionGroups, dp)
}
w.refreshDpSelector(rwPartitionGroups)
return nil
}
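// replaceOrInsertPartition updates the cached entry of the partition in place (keeping
// its metrics) or inserts a new entry with fresh metrics.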
func (w *Wrapper) replaceOrInsertPartition(dp *DataPartition) {
var oldstatus int8
w.Lock.Lock()
old, ok := w.partitions[dp.PartitionID]
if ok {
oldstatus = old.Status
old.Status = dp.Status
old.ReplicaNum = dp.ReplicaNum
old.Hosts = dp.Hosts
old.IsDiscard = dp.IsDiscard
old.NearHosts = dp.Hosts
dp.Metrics = old.Metrics
} else {
dp.Metrics = NewDataPartitionMetrics()
w.partitions[dp.PartitionID] = dp
}
w.Lock.Unlock()
if ok && oldstatus != dp.Status {
log.LogInfof("partition:dp[%v] address %p status change (%v) -> (%v)", dp.PartitionID, &old, oldstatus, dp.Status)
}
}
// GetDataPartition returns the data partition based on the given partition ID.
func (w *Wrapper) GetDataPartition(partitionID uint64) (*DataPartition, error) {
dp, ok := w.tryGetPartition(partitionID)
if !ok && !proto.IsCold(w.volType) { // cache miss && hot volume
err := w.getDataPartitionFromMaster(false, partitionID)
if err == nil {
dp, ok = w.tryGetPartition(partitionID)
if !ok {
return nil, fmt.Errorf("partition[%v] not exsit", partitionID)
}
return dp, nil
}
return nil, fmt.Errorf("partition[%v] not exsit", partitionID)
}
if !ok {
return nil, fmt.Errorf("partition[%v] not exsit", partitionID)
}
return dp, nil
}
func (w *Wrapper) GetReadVerSeq() uint64 {
return w.verReadSeq
}
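// CheckReadVerSeq validates the requested read version against the volume's version list
// and returns the effective read version, which also covers uncommitted versions up to
// (but not including) the next normal version.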
func (w *Wrapper) CheckReadVerSeq(volName string, verReadSeq uint64, verList *proto.VolVersionInfoList) (readReadVer uint64, err error) {
w.Lock.RLock()
defer w.Lock.RUnlock()
log.LogInfof("action[CheckReadVerSeq] vol [%v] req seq [%v]", volName, verReadSeq)
readReadVer = verReadSeq
// Whether it is version 0 or any other version, there may be uncommitted versions between the requested version
// and the next official version. In this case, the data needs to be read.
if verReadSeq == math.MaxUint64 {
verReadSeq = 0
}
var (
id int
ver *proto.VolVersionInfo
verLen = len(verList.VerList)
)
for id, ver = range verList.VerList {
if id == verLen-1 {
err = fmt.Errorf("action[CheckReadVerSeq] readReadVer %v not found", readReadVer)
break
}
log.LogInfof("action[CheckReadVerSeq] ver %v,%v", ver.Ver, ver.Status)
if ver.Ver == verReadSeq {
if ver.Status != proto.VersionNormal {
err = fmt.Errorf("action[CheckReadVerSeq] status %v not right", ver.Status)
return
}
readReadVer = verList.VerList[id+1].Ver - 1
log.LogInfof("action[CheckReadVerSeq] get read ver %v", readReadVer)
return
}
}
err = fmt.Errorf("not found read ver %v", verReadSeq)
return
}
// WarningMsg returns the warning message that contains the cluster name.
func (w *Wrapper) WarningMsg() string {
return fmt.Sprintf("%s_client_warning", w.clusterName)
}
func (w *Wrapper) updateDataNodeStatus() (err error) {
var cv *proto.ClusterView
cv, err = w.mc.AdminAPI().GetCluster()
if err != nil {
log.LogErrorf("updateDataNodeStatus: get cluster fail: err(%v)", err)
return
}
newHostsStatus := make(map[string]bool)
for _, node := range cv.DataNodes {
newHostsStatus[node.Addr] = node.IsActive
}
log.LogInfof("updateDataNodeStatus: update %d hosts status", len(newHostsStatus))
w.HostsStatus = newHostsStatus
return
}
func (w *Wrapper) SetNearRead(nearRead bool) {
w.nearRead = nearRead
log.LogInfof("SetNearRead: set nearRead to %v", w.nearRead)
}
func (w *Wrapper) NearRead() bool {
return w.nearRead
}
// Sort hosts by distance from the local node
func (w *Wrapper) sortHostsByDistance(srcHosts []string) []string {
hosts := make([]string, len(srcHosts))
copy(hosts, srcHosts)
for i := 0; i < len(hosts); i++ {
for j := i + 1; j < len(hosts); j++ {
if distanceFromLocal(hosts[i]) > distanceFromLocal(hosts[j]) {
hosts[i], hosts[j] = hosts[j], hosts[i]
}
}
}
return hosts
}
func distanceFromLocal(b string) int {
remote := strings.Split(b, ":")[0]
return iputil.GetDistance(net.ParseIP(LocalIP), net.ParseIP(remote))
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"strconv"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type AdminAPI struct {
mc *MasterClient
h map[string]string // extra headers
}
func (api *AdminAPI) WithHeader(key, val string) *AdminAPI {
return &AdminAPI{mc: api.mc, h: mergeHeader(api.h, key, val)}
}
func (api *AdminAPI) EncodingWith(encoding string) *AdminAPI {
return api.WithHeader(headerAcceptEncoding, encoding)
}
func (api *AdminAPI) EncodingGzip() *AdminAPI {
return api.EncodingWith(encodingGzip)
}
func (api *AdminAPI) GetCluster() (cv *proto.ClusterView, err error) {
cv = &proto.ClusterView{}
err = api.mc.requestWith(cv, newRequest(get, proto.AdminGetCluster).Header(api.h))
return
}
func (api *AdminAPI) GetClusterNodeInfo() (cn *proto.ClusterNodeInfo, err error) {
cn = &proto.ClusterNodeInfo{}
err = api.mc.requestWith(cn, newRequest(get, proto.AdminGetNodeInfo).Header(api.h))
return
}
func (api *AdminAPI) GetClusterIP() (cp *proto.ClusterIP, err error) {
cp = &proto.ClusterIP{}
err = api.mc.requestWith(cp, newRequest(get, proto.AdminGetIP).Header(api.h))
return
}
func (api *AdminAPI) GetClusterStat() (cs *proto.ClusterStatInfo, err error) {
cs = &proto.ClusterStatInfo{}
err = api.mc.requestWith(cs, newRequest(get, proto.AdminClusterStat).Header(api.h).NoTimeout())
return
}
func (api *AdminAPI) ListZones() (zoneViews []*proto.ZoneView, err error) {
zoneViews = make([]*proto.ZoneView, 0)
err = api.mc.requestWith(&zoneViews, newRequest(get, proto.GetAllZones).Header(api.h))
return
}
func (api *AdminAPI) ListNodeSets(zoneName string) (nodeSetStats []*proto.NodeSetStat, err error) {
params := make([]anyParam, 0)
if zoneName != "" {
params = append(params, anyParam{"zoneName", zoneName})
}
nodeSetStats = make([]*proto.NodeSetStat, 0)
err = api.mc.requestWith(&nodeSetStats, newRequest(get, proto.GetAllNodeSets).Header(api.h).Param(params...))
return
}
func (api *AdminAPI) GetNodeSet(nodeSetId string) (nodeSetStatInfo *proto.NodeSetStatInfo, err error) {
nodeSetStatInfo = &proto.NodeSetStatInfo{}
err = api.mc.requestWith(nodeSetStatInfo, newRequest(get, proto.GetNodeSet).
Header(api.h).addParam("nodesetId", nodeSetId))
return
}
func (api *AdminAPI) UpdateNodeSet(nodeSetId string, dataNodeSelector string, metaNodeSelector string) (err error) {
return api.mc.request(newRequest(get, proto.UpdateNodeSet).Header(api.h).Param(
anyParam{"nodesetId", nodeSetId},
anyParam{"dataNodeSelector", dataNodeSelector},
anyParam{"metaNodeSelector", metaNodeSelector},
))
}
func (api *AdminAPI) UpdateZone(name string, enable bool, dataNodesetSelector string, metaNodesetSelector string, dataNodeSelector string, metaNodeSelector string) (err error) {
return api.mc.request(newRequest(post, proto.UpdateZone).Header(api.h).Param(
anyParam{"name", name},
anyParam{"enable", enable},
anyParam{"dataNodesetSelector", dataNodesetSelector},
anyParam{"metaNodesetSelector", metaNodesetSelector},
anyParam{"dataNodeSelector", dataNodeSelector},
anyParam{"metaNodeSelector", metaNodeSelector},
))
}
func (api *AdminAPI) Topo() (topo *proto.TopologyView, err error) {
topo = &proto.TopologyView{}
err = api.mc.requestWith(topo, newRequest(get, proto.GetTopologyView).Header(api.h))
return
}
func (api *AdminAPI) GetDataPartition(volName string, partitionID uint64) (partition *proto.DataPartitionInfo, err error) {
partition = &proto.DataPartitionInfo{}
err = api.mc.requestWith(partition, newRequest(get, proto.AdminGetDataPartition).
Header(api.h).Param(anyParam{"id", partitionID}, anyParam{"name", volName}))
return
}
func (api *AdminAPI) GetDataPartitionById(partitionID uint64) (partition *proto.DataPartitionInfo, err error) {
partition = &proto.DataPartitionInfo{}
err = api.mc.requestWith(partition, newRequest(get, proto.AdminGetDataPartition).
Header(api.h).addParamAny("id", partitionID))
return
}
func (api *AdminAPI) DiagnoseDataPartition(ignoreDiscardDp bool) (diagnosis *proto.DataPartitionDiagnosis, err error) {
diagnosis = &proto.DataPartitionDiagnosis{}
err = api.mc.requestWith(diagnosis, newRequest(get, proto.AdminDiagnoseDataPartition).
Header(api.h).addParamAny("ignoreDiscard", ignoreDiscardDp))
return
}
func (api *AdminAPI) DiagnoseMetaPartition() (diagnosis *proto.MetaPartitionDiagnosis, err error) {
diagnosis = &proto.MetaPartitionDiagnosis{}
err = api.mc.requestWith(diagnosis, newRequest(get, proto.AdminDiagnoseMetaPartition).Header(api.h))
return
}
func (api *AdminAPI) LoadDataPartition(volName string, partitionID uint64, clientIDKey string) (err error) {
return api.mc.request(newRequest(get, proto.AdminLoadDataPartition).Header(api.h).Param(
anyParam{"id", partitionID},
anyParam{"name", volName},
anyParam{"clientIDKey", clientIDKey},
))
}
func (api *AdminAPI) CreateDataPartition(volName string, count int, clientIDKey string) (err error) {
return api.mc.request(newRequest(get, proto.AdminCreateDataPartition).Header(api.h).Param(
anyParam{"name", volName},
anyParam{"count", count},
anyParam{"clientIDKey", clientIDKey},
))
}
func (api *AdminAPI) DecommissionDataPartition(dataPartitionID uint64, nodeAddr string, raftForce bool, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDecommissionDataPartition).Header(api.h)
request.addParam("id", strconv.FormatUint(dataPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("raftForceDel", strconv.FormatBool(raftForce))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DecommissionMetaPartition(metaPartitionID uint64, nodeAddr, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDecommissionMetaPartition).Header(api.h)
request.addParam("id", strconv.FormatUint(metaPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DeleteDataReplica(dataPartitionID uint64, nodeAddr, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDeleteDataReplica).Header(api.h)
request.addParam("id", strconv.FormatUint(dataPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) AddDataReplica(dataPartitionID uint64, nodeAddr, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminAddDataReplica).Header(api.h)
request.addParam("id", strconv.FormatUint(dataPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DeleteMetaReplica(metaPartitionID uint64, nodeAddr string, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDeleteMetaReplica).Header(api.h)
request.addParam("id", strconv.FormatUint(metaPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) AddMetaReplica(metaPartitionID uint64, nodeAddr string, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminAddMetaReplica).Header(api.h)
request.addParam("id", strconv.FormatUint(metaPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DeleteVolume(volName, authKey string) (err error) {
request := newRequest(get, proto.AdminDeleteVol).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DeleteVolumeWithAuthNode(volName, authKey, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDeleteVol).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) UpdateVolume(
vv *proto.SimpleVolView,
txTimeout int64,
txMask string,
txForceReset bool,
txConflictRetryNum int64,
txConflictRetryInterval int64,
txOpLimit int,
clientIDKey string,
) (err error) {
request := newRequest(get, proto.AdminUpdateVol).Header(api.h)
request.addParam("name", vv.Name)
request.addParam("description", vv.Description)
request.addParam("authKey", util.CalcAuthKey(vv.Owner))
request.addParam("zoneName", vv.ZoneName)
request.addParam("capacity", strconv.FormatUint(vv.Capacity, 10))
request.addParam("followerRead", strconv.FormatBool(vv.FollowerRead))
request.addParam("ebsBlkSize", strconv.Itoa(vv.ObjBlockSize))
request.addParam("cacheCap", strconv.FormatUint(vv.CacheCapacity, 10))
request.addParam("cacheAction", strconv.Itoa(vv.CacheAction))
request.addParam("cacheThreshold", strconv.Itoa(vv.CacheThreshold))
request.addParam("cacheTTL", strconv.Itoa(vv.CacheTtl))
request.addParam("cacheHighWater", strconv.Itoa(vv.CacheHighWater))
request.addParam("cacheLowWater", strconv.Itoa(vv.CacheLowWater))
request.addParam("cacheLRUInterval", strconv.Itoa(vv.CacheLruInterval))
request.addParam("cacheRuleKey", vv.CacheRule)
request.addParam("dpReadOnlyWhenVolFull", strconv.FormatBool(vv.DpReadOnlyWhenVolFull))
request.addParam("replicaNum", strconv.FormatUint(uint64(vv.DpReplicaNum), 10))
request.addParam("enableQuota", strconv.FormatBool(vv.EnableQuota))
request.addParam("deleteLockTime", strconv.FormatInt(vv.DeleteLockTime, 10))
request.addParam("clientIDKey", clientIDKey)
if txMask != "" {
request.addParam("enableTxMask", txMask)
request.addParam("txForceReset", strconv.FormatBool(txForceReset))
}
if txTimeout > 0 {
request.addParam("txTimeout", strconv.FormatInt(txTimeout, 10))
}
if txConflictRetryNum > 0 {
request.addParam("txConflictRetryNum", strconv.FormatInt(txConflictRetryNum, 10))
}
if txOpLimit > 0 {
request.addParam("txOpLimit", strconv.Itoa(txOpLimit))
}
if txConflictRetryInterval > 0 {
request.addParam("txConflictRetryInterval", strconv.FormatInt(txConflictRetryInterval, 10))
}
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) PutDataPartitions(volName string, dpsView []byte) (err error) {
return api.mc.request(newRequest(post, proto.AdminPutDataPartitions).
Header(api.h).addParam("name", volName).Body(dpsView))
}
func (api *AdminAPI) VolShrink(volName string, capacity uint64, authKey, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminVolShrink).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
request.addParam("capacity", strconv.FormatUint(capacity, 10))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) VolExpand(volName string, capacity uint64, authKey, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminVolExpand).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
request.addParam("capacity", strconv.FormatUint(capacity, 10))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) CreateVolName(volName, owner string, capacity uint64, deleteLockTime int64, crossZone, normalZonesFirst bool, business string,
mpCount, dpCount, replicaNum, dpSize, volType int, followerRead bool, zoneName, cacheRuleKey string, ebsBlkSize,
cacheCapacity, cacheAction, cacheThreshold, cacheTTL, cacheHighWater, cacheLowWater, cacheLRUInterval int,
dpReadOnlyWhenVolFull bool, txMask string, txTimeout uint32, txConflictRetryNum int64, txConflictRetryInterval int64, optEnableQuota string,
clientIDKey string,
) (err error) {
request := newRequest(get, proto.AdminCreateVol).Header(api.h)
request.addParam("name", volName)
request.addParam("owner", owner)
request.addParam("capacity", strconv.FormatUint(capacity, 10))
request.addParam("deleteLockTime", strconv.FormatInt(deleteLockTime, 10))
request.addParam("crossZone", strconv.FormatBool(crossZone))
request.addParam("normalZonesFirst", strconv.FormatBool(normalZonesFirst))
request.addParam("description", business)
request.addParam("mpCount", strconv.Itoa(mpCount))
request.addParam("dpCount", strconv.Itoa(dpCount))
request.addParam("replicaNum", strconv.Itoa(replicaNum))
request.addParam("dpSize", strconv.Itoa(dpSize))
request.addParam("volType", strconv.Itoa(volType))
request.addParam("followerRead", strconv.FormatBool(followerRead))
request.addParam("zoneName", zoneName)
request.addParam("cacheRuleKey", cacheRuleKey)
request.addParam("ebsBlkSize", strconv.Itoa(ebsBlkSize))
request.addParam("cacheCap", strconv.Itoa(cacheCapacity))
request.addParam("cacheAction", strconv.Itoa(cacheAction))
request.addParam("cacheThreshold", strconv.Itoa(cacheThreshold))
request.addParam("cacheTTL", strconv.Itoa(cacheTTL))
request.addParam("cacheHighWater", strconv.Itoa(cacheHighWater))
request.addParam("cacheLowWater", strconv.Itoa(cacheLowWater))
request.addParam("cacheLRUInterval", strconv.Itoa(cacheLRUInterval))
request.addParam("dpReadOnlyWhenVolFull", strconv.FormatBool(dpReadOnlyWhenVolFull))
request.addParam("enableQuota", optEnableQuota)
request.addParam("clientIDKey", clientIDKey)
if txMask != "" {
request.addParam("enableTxMask", txMask)
}
if txTimeout > 0 {
request.addParam("txTimeout", strconv.FormatUint(uint64(txTimeout), 10))
}
if txConflictRetryNum > 0 {
request.addParam("txConflictRetryNum", strconv.FormatInt(txConflictRetryNum, 10))
}
if txConflictRetryInterval > 0 {
request.addParam("txConflictRetryInterval", strconv.FormatInt(txConflictRetryInterval, 10))
}
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) CreateDefaultVolume(volName, owner string) (err error) {
request := newRequest(get, proto.AdminCreateVol).Header(api.h)
request.addParam("name", volName)
request.addParam("owner", owner)
request.addParam("capacity", "10")
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) GetVolumeSimpleInfo(volName string) (vv *proto.SimpleVolView, err error) {
vv = &proto.SimpleVolView{}
err = api.mc.requestWith(vv, newRequest(get, proto.AdminGetVol).Header(api.h).addParam("name", volName))
return
}
func (api *AdminAPI) SetVolumeForbidden(volName string, forbidden bool) (err error) {
request := newRequest(post, proto.AdminVolForbidden).Header(api.h)
request.addParam("name", volName)
request.addParam("forbidden", strconv.FormatBool(forbidden))
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetVolumeAuditLog(volName string, enable bool) (err error) {
request := newRequest(post, proto.AdminVolEnableAuditLog).Header(api.h)
request.addParam("name", volName)
request.addParam("enable", strconv.FormatBool(enable))
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) GetMonitorPushAddr() (addr string, err error) {
err = api.mc.requestWith(&addr, newRequest(get, proto.AdminGetMonitorPushAddr).Header(api.h))
return
}
func (api *AdminAPI) UploadFlowInfo(volName string, flowInfo *proto.ClientReportLimitInfo) (vv *proto.LimitRsp2Client, err error) {
if flowInfo == nil {
return nil, fmt.Errorf("flowinfo is nil")
}
vv = &proto.LimitRsp2Client{}
err = api.mc.requestWith(vv, newRequest(get, proto.QosUpload).Header(api.h).Body(flowInfo).
Param(anyParam{"name", volName}, anyParam{"qosEnable", "true"}))
log.LogInfof("action[UploadFlowInfo] enable %v", vv.Enable)
return
}
func (api *AdminAPI) GetVolumeSimpleInfoWithFlowInfo(volName string) (vv *proto.SimpleVolView, err error) {
vv = &proto.SimpleVolView{}
err = api.mc.requestWith(vv, newRequest(get, proto.AdminGetVol).
Header(api.h).Param(anyParam{"name", volName}, anyParam{"init", "true"}))
return
}
// access control list
func (api *AdminAPI) CheckACL() (ci *proto.ClusterInfo, err error) {
ci = &proto.ClusterInfo{}
err = api.mc.requestWith(ci, newRequest(get, proto.AdminACL).Header(api.h))
return
}
func (api *AdminAPI) GetClusterInfo() (ci *proto.ClusterInfo, err error) {
ci = &proto.ClusterInfo{}
err = api.mc.requestWith(ci, newRequest(get, proto.AdminGetIP).Header(api.h))
return
}
func (api *AdminAPI) GetVerInfo(volName string) (ci *proto.VolumeVerInfo, err error) {
ci = &proto.VolumeVerInfo{}
err = api.mc.requestWith(ci, newRequest(get, proto.AdminGetVolVer).
Header(api.h).addParam("name", volName))
return
}
func (api *AdminAPI) CreateMetaPartition(volName string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminCreateMetaPartition).Header(api.h)
request.addParam("name", volName)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) ListVols(keywords string) (volsInfo []*proto.VolInfo, err error) {
volsInfo = make([]*proto.VolInfo, 0)
err = api.mc.requestWith(&volsInfo, newRequest(get, proto.AdminListVols).
Header(api.h).addParam("keywords", keywords))
return
}
func (api *AdminAPI) IsFreezeCluster(isFreeze bool, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminClusterFreeze).Header(api.h)
request.addParam("enable", strconv.FormatBool(isFreeze))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetForbidMpDecommission(disable bool) (err error) {
request := newRequest(get, proto.AdminClusterForbidMpDecommission).Header(api.h)
request.addParam("enable", strconv.FormatBool(disable))
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetMetaNodeThreshold(threshold float64, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminSetMetaNodeThreshold).Header(api.h)
request.addParam("threshold", strconv.FormatFloat(threshold, 'f', 6, 64))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetClusterParas(batchCount, markDeleteRate, deleteWorkerSleepMs, autoRepairRate, loadFactor, maxDpCntLimit, clientIDKey string,
dataNodesetSelector, metaNodesetSelector, dataNodeSelector, metaNodeSelector string,
) (err error) {
request := newRequest(get, proto.AdminSetNodeInfo).Header(api.h)
request.addParam("batchCount", batchCount)
request.addParam("markDeleteRate", markDeleteRate)
request.addParam("deleteWorkerSleepMs", deleteWorkerSleepMs)
request.addParam("autoRepairRate", autoRepairRate)
request.addParam("loadFactor", loadFactor)
request.addParam("maxDpCntLimit", maxDpCntLimit)
request.addParam("clientIDKey", clientIDKey)
request.addParam("dataNodesetSelector", dataNodesetSelector)
request.addParam("metaNodesetSelector", metaNodesetSelector)
request.addParam("dataNodeSelector", dataNodeSelector)
request.addParam("metaNodeSelector", metaNodeSelector)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) GetClusterParas() (delParas map[string]string, err error) {
request := newRequest(get, proto.AdminGetNodeInfo).Header(api.h)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
delParas = make(map[string]string)
err = api.mc.requestWith(&delParas, newRequest(get, proto.AdminGetNodeInfo).Header(api.h))
return
}
func (api *AdminAPI) CreatePreLoadDataPartition(volName string, count int, capacity, ttl uint64, zones string) (view *proto.DataPartitionsView, err error) {
view = &proto.DataPartitionsView{}
err = api.mc.requestWith(view, newRequest(get, proto.AdminCreatePreLoadDataPartition).Header(api.h).Param(
anyParam{"name", volName},
anyParam{"replicaNum", count},
anyParam{"capacity", capacity},
anyParam{"cacheTTL", ttl},
anyParam{"zoneName", zongs},
))
return
}
func (api *AdminAPI) ListQuota(volName string) (quotaInfo []*proto.QuotaInfo, err error) {
resp := &proto.ListMasterQuotaResponse{}
if err = api.mc.requestWith(resp, newRequest(get, proto.QuotaList).
Header(api.h).addParam("name", volName)); err != nil {
log.LogErrorf("action[ListQuota] fail. %v", err)
return
}
quotaInfo = resp.Quotas
log.LogInfof("action[ListQuota] success.")
return quotaInfo, err
}
func (api *AdminAPI) CreateQuota(volName string, quotaPathInfos []proto.QuotaPathInfo, maxFiles uint64, maxBytes uint64) (quotaId uint32, err error) {
if err = api.mc.requestWith("aId, newRequest(get, proto.QuotaCreate).
Header(api.h).Body("aPathInfos).Param(
anyParam{"name", volName},
anyParam{"maxFiles", maxFiles},
anyParam{"maxBytes", maxBytes})); err != nil {
log.LogErrorf("action[CreateQuota] fail. %v", err)
return
}
log.LogInfof("action[CreateQuota] success.")
return
}
func (api *AdminAPI) UpdateQuota(volName string, quotaId string, maxFiles uint64, maxBytes uint64) (err error) {
request := newRequest(get, proto.QuotaUpdate).Header(api.h)
request.addParam("name", volName)
request.addParam("quotaId", quotaId)
request.addParam("maxFiles", strconv.FormatUint(maxFiles, 10))
request.addParam("maxBytes", strconv.FormatUint(maxBytes, 10))
if _, err = api.mc.serveRequest(request); err != nil {
log.LogErrorf("action[UpdateQuota] fail. %v", err)
return
}
log.LogInfof("action[UpdateQuota] success.")
return nil
}
func (api *AdminAPI) DeleteQuota(volName string, quotaId string) (err error) {
request := newRequest(get, proto.QuotaDelete).Header(api.h)
request.addParam("name", volName)
request.addParam("quotaId", quotaId)
if _, err = api.mc.serveRequest(request); err != nil {
log.LogErrorf("action[DeleteQuota] fail. %v", err)
return
}
log.LogInfo("action[DeleteQuota] success.")
return nil
}
func (api *AdminAPI) GetQuota(volName string, quotaId string) (quotaInfo *proto.QuotaInfo, err error) {
info := &proto.QuotaInfo{}
if err = api.mc.requestWith(info, newRequest(get, proto.QuotaGet).Header(api.h).
Param(anyParam{"name", volName}, anyParam{"quotaId", quotaId})); err != nil {
log.LogErrorf("action[GetQuota] fail. %v", err)
return
}
quotaInfo = info
log.LogInfof("action[GetQuota] %v success.", *quotaInfo)
return quotaInfo, err
}
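// exampleQuotaWorkflow is an illustrative sketch (not part of this SDK) of the
// quota life cycle exposed above: create a quota for a set of paths, tighten
// its limits, then delete it. The helper name and the limit values are
// placeholders chosen for the example.
func exampleQuotaWorkflow(api *AdminAPI, volName string, paths []proto.QuotaPathInfo) error {
	quotaId, err := api.CreateQuota(volName, paths, 100000, 1<<30)
	if err != nil {
		return err
	}
	// UpdateQuota and DeleteQuota take the quota ID as a decimal string.
	idStr := strconv.FormatUint(uint64(quotaId), 10)
	if err = api.UpdateQuota(volName, idStr, 50000, 1<<29); err != nil {
		return err
	}
	return api.DeleteQuota(volName, idStr)
}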
func (api *AdminAPI) QueryBadDisks() (badDisks *proto.BadDiskInfos, err error) {
badDisks = &proto.BadDiskInfos{}
err = api.mc.requestWith(badDisks, newRequest(get, proto.QueryBadDisks).Header(api.h))
return
}
func (api *AdminAPI) DecommissionDisk(addr string, disk string) (err error) {
return api.mc.request(newRequest(post, proto.DecommissionDisk).Header(api.h).
addParam("addr", addr).addParam("disk", disk))
}
func (api *AdminAPI) RecommissionDisk(addr string, disk string) (err error) {
return api.mc.request(newRequest(post, proto.RecommissionDisk).Header(api.h).
addParam("addr", addr).addParam("disk", disk))
}
func (api *AdminAPI) QueryDecommissionDiskProgress(addr string, disk string) (progress *proto.DecommissionProgress, err error) {
progress = &proto.DecommissionProgress{}
err = api.mc.requestWith(progress, newRequest(post, proto.QueryDiskDecoProgress).
Header(api.h).Param(anyParam{"addr", addr}, anyParam{"disk", disk}))
return
}
func (api *AdminAPI) ListQuotaAll() (volsInfo []*proto.VolInfo, err error) {
volsInfo = make([]*proto.VolInfo, 0)
err = api.mc.requestWith(&volsInfo, newRequest(get, proto.QuotaListAll).Header(api.h))
return
}
func (api *AdminAPI) GetDiscardDataPartition() (discardDpInfos *proto.DiscardDataPartitionInfos, err error) {
discardDpInfos = &proto.DiscardDataPartitionInfos{}
err = api.mc.requestWith(&discardDpInfos, newRequest(get, proto.AdminGetDiscardDp).Header(api.h))
return
}
func (api *AdminAPI) SetDataPartitionDiscard(partitionId uint64, discard bool) (err error) {
request := newRequest(post, proto.AdminSetDpDiscard).
Header(api.h).
addParam("id", strconv.FormatUint(partitionId, 10)).
addParam("dpDiscard", strconv.FormatBool(discard))
if err = api.mc.request(request); err != nil {
return
}
return
}
func (api *AdminAPI) DeleteVersion(volName string, verSeq string) (err error) {
request := newRequest(get, proto.AdminDelVersion).Header(api.h)
request.addParam("name", volName)
request.addParam("verSeq", verSeq)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetStrategy(volName string, periodic string, count string, enable string, force string) (err error) {
request := newRequest(get, proto.AdminSetVerStrategy).Header(api.h)
request.addParam("name", volName)
request.addParam("periodic", periodic)
request.addParam("count", count)
request.addParam("enable", enable)
request.addParam("force", force)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) CreateVersion(volName string) (ver *proto.VolVersionInfo, err error) {
ver = &proto.VolVersionInfo{}
err = api.mc.requestWith(ver, newRequest(get, proto.AdminCreateVersion).
Header(api.h).addParam("name", volName))
return
}
func (api *AdminAPI) GetLatestVer(volName string) (ver *proto.VolVersionInfo, err error) {
ver = &proto.VolVersionInfo{}
err = api.mc.requestWith(ver, newRequest(get, proto.AdminGetVersionInfo).
Header(api.h).addParam("name", volName))
return
}
func (api *AdminAPI) GetVerList(volName string) (verList *proto.VolVersionInfoList, err error) {
verList = &proto.VolVersionInfoList{}
err = api.mc.requestWith(verList, newRequest(get, proto.AdminGetAllVersionInfo).
Header(api.h).addParam("name", volName))
log.LogDebugf("GetVerList. vol %v verList %v", volName, verList)
for _, info := range verList.VerList {
log.LogDebugf("GetVerList. vol %v verList %v", volName, info)
}
return
}
func (api *AdminAPI) SetBucketLifecycle(req *proto.LcConfiguration) (err error) {
return api.mc.request(newRequest(post, proto.SetBucketLifecycle).Header(api.h).Body(req))
}
func (api *AdminAPI) GetBucketLifecycle(volume string) (lcConf *proto.LcConfiguration, err error) {
lcConf = &proto.LcConfiguration{}
err = api.mc.requestWith(lcConf, newRequest(get, proto.GetBucketLifecycle).
Header(api.h).addParam("name", volume))
return
}
func (api *AdminAPI) DelBucketLifecycle(volume string) (err error) {
request := newRequest(get, proto.DeleteBucketLifecycle).Header(api.h)
request.addParam("name", volume)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) GetS3QoSInfo() (data []byte, err error) {
return api.mc.serveRequest(newRequest(get, proto.S3QoSGet).Header(api.h))
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"math/rand"
"github.com/cubefs/cubefs/proto"
)
type Decoder func([]byte) ([]byte, error)
func (d Decoder) Decode(raw []byte) ([]byte, error) {
return d(raw)
}
type ClientAPI struct {
mc *MasterClient
h map[string]string // extra headers
}
func (api *ClientAPI) WithHeader(key, val string) *ClientAPI {
return &ClientAPI{mc: api.mc, h: mergeHeader(api.h, key, val)}
}
func (api *ClientAPI) EncodingWith(encoding string) *ClientAPI {
return api.WithHeader(headerAcceptEncoding, encoding)
}
func (api *ClientAPI) EncodingGzip() *ClientAPI {
return api.EncodingWith(encodingGzip)
}
func (api *ClientAPI) GetVolume(volName string, authKey string) (vv *proto.VolView, err error) {
vv = &proto.VolView{}
err = api.mc.requestWith(vv, newRequest(post, proto.ClientVol).
Header(api.h).Param(anyParam{"name", volName}, anyParam{"authKey", authKey}))
return
}
func (api *ClientAPI) GetVolumeWithoutAuthKey(volName string) (vv *proto.VolView, err error) {
vv = &proto.VolView{}
err = api.mc.requestWith(vv, newRequest(post, proto.ClientVol).
Header(api.h, proto.SkipOwnerValidation, "true").addParam("name", volName))
return
}
func (api *ClientAPI) GetVolumeWithAuthnode(volName string, authKey string, token string, decoder Decoder) (vv *proto.VolView, err error) {
var body []byte
request := newRequest(post, proto.ClientVol).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
request.addParam(proto.ClientMessage, token)
if body, err = api.mc.serveRequest(request); err != nil {
return
}
if decoder != nil {
if body, err = decoder.Decode(body); err != nil {
return
}
}
vv = &proto.VolView{}
if err = json.Unmarshal(body, vv); err != nil {
return
}
return
}
func (api *ClientAPI) GetVolumeStat(volName string) (info *proto.VolStatInfo, err error) {
info = &proto.VolStatInfo{}
err = api.mc.requestWith(info, newRequest(get, proto.ClientVolStat).
Header(api.h).Param(anyParam{"name", volName}, anyParam{"version", proto.LFClient}))
return
}
func (api *ClientAPI) GetMetaPartition(partitionID uint64) (partition *proto.MetaPartitionInfo, err error) {
partition = &proto.MetaPartitionInfo{}
err = api.mc.requestWith(partition, newRequest(get, proto.ClientMetaPartition).
Header(api.h).addParamAny("id", partitionID))
return
}
func (api *ClientAPI) GetMetaPartitions(volName string) (views []*proto.MetaPartitionView, err error) {
views = make([]*proto.MetaPartitionView, 0)
err = api.mc.requestWith(&views, newRequest(get, proto.ClientMetaPartitions).
Header(api.h).addParam("name", volName))
return
}
func (api *ClientAPI) GetDataPartitions(volName string) (view *proto.DataPartitionsView, err error) {
request := newRequest(get, proto.ClientDataPartitions).Header(api.h).addParam("name", volName)
lastLeader := api.mc.leaderAddr
defer api.mc.SetLeader(lastLeader)
randIndex := rand.Intn(len(api.mc.masters))
if randIndex >= len(api.mc.masters) {
err = fmt.Errorf("master len %v less or equal request index %v", len(api.mc.masters), randIndex)
return
}
api.mc.SetLeader(api.mc.masters[randIndex])
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
view = &proto.DataPartitionsView{}
if err = json.Unmarshal(data, view); err != nil {
return
}
return
}
func (api *ClientAPI) GetPreLoadDataPartitions(volName string) (view *proto.DataPartitionsView, err error) {
view = &proto.DataPartitionsView{}
err = api.mc.requestWith(view, newRequest(get, proto.ClientDataPartitions).
Header(api.h).addParam("name", volName))
return
}
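// exampleFetchVolume is an illustrative sketch (not part of this SDK) of how a
// client typically combines the calls above: fetch the volume view with the
// owner-derived authKey, then load the data partition view used for I/O. The
// helper name is hypothetical.
func exampleFetchVolume(api *ClientAPI, volName, authKey string) (*proto.VolView, *proto.DataPartitionsView, error) {
	vv, err := api.GetVolume(volName, authKey)
	if err != nil {
		return nil, nil, err
	}
	dpView, err := api.GetDataPartitions(volName)
	if err != nil {
		return nil, nil, err
	}
	return vv, dpView, nil
}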
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"strconv"
"github.com/cubefs/cubefs/proto"
)
type NodeAPI struct {
mc *MasterClient
h map[string]string // extra headers
}
func (api *NodeAPI) WithHeader(key, val string) *NodeAPI {
return &NodeAPI{mc: api.mc, h: mergeHeader(api.h, key, val)}
}
func (api *NodeAPI) EncodingWith(encoding string) *NodeAPI {
return api.WithHeader(headerAcceptEncoding, encoding)
}
func (api *NodeAPI) EncodingGzip() *NodeAPI {
return api.EncodingWith(encodingGzip)
}
func (api *NodeAPI) AddDataNode(serverAddr, zoneName string) (id uint64, err error) {
request := newRequest(get, proto.AddDataNode).Header(api.h)
request.addParam("addr", serverAddr)
request.addParam("zoneName", zoneName)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) AddDataNodeWithAuthNode(serverAddr, zoneName, clientIDKey string) (id uint64, err error) {
request := newRequest(get, proto.AddDataNode).Header(api.h)
request.addParam("addr", serverAddr)
request.addParam("zoneName", zoneName)
request.addParam("clientIDKey", clientIDKey)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) AddMetaNode(serverAddr, zoneName string) (id uint64, err error) {
request := newRequest(get, proto.AddMetaNode).Header(api.h)
request.addParam("addr", serverAddr)
request.addParam("zoneName", zoneName)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) AddMetaNodeWithAuthNode(serverAddr, zoneName, clientIDKey string) (id uint64, err error) {
request := newRequest(get, proto.AddMetaNode).Header(api.h)
request.addParam("addr", serverAddr)
request.addParam("zoneName", zoneName)
request.addParam("clientIDKey", clientIDKey)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) GetDataNode(serverHost string) (node *proto.DataNodeInfo, err error) {
node = &proto.DataNodeInfo{}
err = api.mc.requestWith(node, newRequest(get, proto.GetDataNode).Header(api.h).addParam("addr", serverHost))
return
}
func (api *NodeAPI) GetMetaNode(serverHost string) (node *proto.MetaNodeInfo, err error) {
node = &proto.MetaNodeInfo{}
err = api.mc.requestWith(node, newRequest(get, proto.GetMetaNode).Header(api.h).addParam("addr", serverHost))
return
}
func (api *NodeAPI) ResponseMetaNodeTask(task *proto.AdminTask) (err error) {
return api.mc.request(newRequest(post, proto.GetMetaNodeTaskResponse).Header(api.h).Body(task))
}
func (api *NodeAPI) ResponseDataNodeTask(task *proto.AdminTask) (err error) {
return api.mc.request(newRequest(post, proto.GetDataNodeTaskResponse).Header(api.h).Body(task))
}
func (api *NodeAPI) DataNodeDecommission(nodeAddr string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.DecommissionDataNode).Header(api.h).NoTimeout()
request.addParam("addr", nodeAddr)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *NodeAPI) MetaNodeDecommission(nodeAddr string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.DecommissionMetaNode).Header(api.h).NoTimeout()
request.addParam("addr", nodeAddr)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *NodeAPI) MetaNodeMigrate(srcAddr, targetAddr string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.MigrateMetaNode).Header(api.h).NoTimeout()
request.addParam("srcAddr", srcAddr)
request.addParam("targetAddr", targetAddr)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *NodeAPI) DataNodeMigrate(srcAddr, targetAddr string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.MigrateDataNode).Header(api.h).NoTimeout()
request.addParam("srcAddr", srcAddr)
request.addParam("targetAddr", targetAddr)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *NodeAPI) AddLcNode(serverAddr string) (id uint64, err error) {
request := newRequest(get, proto.AddLcNode).Header(api.h).addParam("addr", serverAddr)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) ResponseLcNodeTask(task *proto.AdminTask) (err error) {
return api.mc.request(newRequest(post, proto.GetLcNodeTaskResponse).Header(api.h).Body(task))
}
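// exampleNodeLifecycle is an illustrative sketch (not part of this SDK) of the
// node management flow exposed above: register a data node in a zone, keep the
// master-assigned ID, and later decommission the node. The address, zone name,
// and empty clientIDKey are placeholders.
func exampleNodeLifecycle(api *NodeAPI) error {
	id, err := api.AddDataNode("10.0.0.5:17310", "default")
	if err != nil {
		return err
	}
	_ = id // master-assigned node ID
	return api.DataNodeDecommission("10.0.0.5:17310", 0, "")
}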
package master
import (
"fmt"
"os"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/ump"
)
type UserAPI struct {
mc *MasterClient
h map[string]string // extra headers
}
func (api *UserAPI) WithHeader(key, val string) *UserAPI {
return &UserAPI{mc: api.mc, h: mergeHeader(api.h, key, val)}
}
func (api *UserAPI) EncodingWith(encoding string) *UserAPI {
return api.WithHeader(headerAcceptEncoding, encoding)
}
func (api *UserAPI) EncodingGzip() *UserAPI {
return api.EncodingWith(encodingGzip)
}
func (api *UserAPI) CreateUser(param *proto.UserCreateParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserCreate).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) DeleteUser(userID string, clientIDKey string) (err error) {
request := newRequest(post, proto.UserDelete).Header(api.h)
request.addParam("user", userID)
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *UserAPI) UpdateUser(param *proto.UserUpdateParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserUpdate).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) GetAKInfo(accesskey string) (userInfo *proto.UserInfo, err error) {
localIP, _ := ump.GetLocalIpAddr()
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(get, proto.UserGetAKInfo).
Header(api.h).Param(anyParam{"ak", accesskey}, anyParam{"ip", localIP}))
return
}
func (api *UserAPI) AclOperation(volName string, localIP string, op uint32) (aclInfo *proto.AclRsp, err error) {
aclInfo = &proto.AclRsp{}
if err = api.mc.requestWith(aclInfo, newRequest(get, proto.AdminACL).Header(api.h).Param(
anyParam{"name", volName},
anyParam{"ip", localIP},
anyParam{"op", op},
)); err != nil {
fmt.Fprintf(os.Stdout, "AclOperation err %v\n", err)
return
}
return
}
func (api *UserAPI) UidOperation(volName string, uid string, op uint32, val string) (uidInfo *proto.UidSpaceRsp, err error) {
uidInfo = &proto.UidSpaceRsp{}
if err = api.mc.requestWith(uidInfo, newRequest(get, proto.AdminUid).Header(api.h).Param(
anyParam{"name", volName},
anyParam{"uid", uid},
anyParam{"op", op},
anyParam{"capacity", val},
)); err != nil {
fmt.Fprintf(os.Stdout, "UidOperation err %v\n", err)
return
}
return
}
func (api *UserAPI) GetUserInfo(userID string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(get, proto.UserGetInfo).Header(api.h).addParam("user", userID))
return
}
func (api *UserAPI) UpdatePolicy(param *proto.UserPermUpdateParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserUpdatePolicy).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) RemovePolicy(param *proto.UserPermRemoveParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserRemovePolicy).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) DeleteVolPolicy(vol, clientIDKey string) (err error) {
return api.mc.request(newRequest(post, proto.UserDeleteVolPolicy).Header(api.h).
addParam("name", vol).addParam("clientIDKey", clientIDKey))
}
func (api *UserAPI) TransferVol(param *proto.UserTransferVolParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserTransferVol).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) ListUsers(keywords string) (users []*proto.UserInfo, err error) {
users = make([]*proto.UserInfo, 0)
err = api.mc.requestWith(&users, newRequest(get, proto.UserList).Header(api.h).addParam("keywords", keywords))
return
}
func (api *UserAPI) ListUsersOfVol(vol string) (users []string, err error) {
users = make([]string, 0)
err = api.mc.requestWith(&users, newRequest(get, proto.UsersOfVol).Header(api.h).addParam("name", vol))
return
}
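// exampleFindUser is an illustrative sketch (not part of this SDK): list users
// whose IDs match a keyword and return the first hit, mirroring how a CLI
// might resolve a partial user name. The helper name is hypothetical.
func exampleFindUser(api *UserAPI, keyword string) (*proto.UserInfo, error) {
	users, err := api.ListUsers(keyword)
	if err != nil {
		return nil, err
	}
	if len(users) == 0 {
		return nil, fmt.Errorf("no user matches %q", keyword)
	}
	return users[0], nil
}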
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/log"
)
// TODO: re-use response body.
const (
requestTimeout = 30 * time.Second
encodingGzip = compressor.EncodingGzip
headerAcceptEncoding = proto.HeaderAcceptEncoding
headerContentEncoding = proto.HeaderContentEncoding
get = http.MethodGet
post = http.MethodPost
)
var ErrNoValidMaster = errors.New("no valid master")
type MasterCLientWithResolver struct {
MasterClient
resolver *NameResolver
updateInverval int
stopC chan struct{}
}
type MasterClient struct {
sync.RWMutex
masters []string
useSSL bool
leaderAddr string
timeout time.Duration
clientIDKey string
adminAPI *AdminAPI
clientAPI *ClientAPI
nodeAPI *NodeAPI
userAPI *UserAPI
}
func (c *MasterClient) ReplaceMasterAddresses(addrs []string) {
c.Lock()
defer c.Unlock()
c.masters = addrs
c.leaderAddr = ""
}
// AddNode adds the given address to the master address list.
func (c *MasterClient) AddNode(address string) {
c.Lock()
c.updateMaster(address)
c.Unlock()
}
// Leader returns the current leader address.
func (c *MasterClient) Leader() (addr string) {
c.RLock()
addr = c.leaderAddr
c.RUnlock()
return
}
func (c *MasterClient) ClientIDKey() string {
return c.clientIDKey
}
func (c *MasterClient) AdminAPI() *AdminAPI {
return c.adminAPI
}
func (c *MasterClient) ClientAPI() *ClientAPI {
return c.clientAPI
}
func (c *MasterClient) NodeAPI() *NodeAPI {
return c.nodeAPI
}
func (c *MasterClient) UserAPI() *UserAPI {
return c.userAPI
}
// SetLeader changes the current leader address.
func (c *MasterClient) SetLeader(addr string) {
c.Lock()
c.leaderAddr = addr
c.Unlock()
}
// SetTimeout changes the request timeout in seconds.
func (c *MasterClient) SetTimeout(timeout uint16) {
c.Lock()
c.timeout = time.Duration(timeout) * time.Second
c.Unlock()
}
func (c *MasterClient) SetClientIDKey(clientIDKey string) {
c.Lock()
c.clientIDKey = clientIDKey
c.Unlock()
}
func (c *MasterClient) serveRequest(r *request) (repsData []byte, err error) {
leaderAddr, nodes := c.prepareRequest()
host := leaderAddr
for i := -1; i < len(nodes); i++ {
if i == -1 {
if host == "" {
continue
}
} else {
host = nodes[i]
}
var resp *http.Response
schema := "http"
if c.useSSL {
schema = "https"
}
url := fmt.Sprintf("%s://%s%s", schema, host, r.path)
resp, err = c.httpRequest(r.method, url, r)
if err != nil {
log.LogErrorf("serveRequest: send http request fail: method(%v) url(%v) err(%v)", r.method, url, err)
continue
}
stateCode := resp.StatusCode
repsData, err = io.ReadAll(resp.Body)
_ = resp.Body.Close()
if err != nil {
log.LogErrorf("serveRequest: read http response body fail: err(%v)", err)
continue
}
switch stateCode {
case http.StatusForbidden:
curMasterAddr := strings.TrimSpace(string(repsData))
curMasterAddr = strings.Replace(curMasterAddr, "\n", "", -1)
if len(curMasterAddr) == 0 {
log.LogWarnf("serveRequest: server response status 403: request(%s) status"+
"(403), body is empty", host)
err = ErrNoValidMaster
return
}
// switch to the master address returned in the 403 body, then retry
c.SetLeader(curMasterAddr)
repsData, err = c.serveRequest(r)
return
case http.StatusOK:
if leaderAddr != host {
log.LogDebugf("server Request resp new master[%v] old [%v]", host, leaderAddr)
c.SetLeader(host)
}
repsData, err = compressor.New(resp.Header.Get(headerContentEncoding)).Decompress(repsData)
if err != nil {
log.LogErrorf("serveRequest: decompress response body fail: err(%v)", err)
return nil, fmt.Errorf("decompress response body err:%v", err)
}
body := new(proto.HTTPReplyRaw)
if err := body.Unmarshal(repsData); err != nil {
log.LogErrorf("unmarshal response body err:%v", err)
return nil, fmt.Errorf("unmarshal response body err:%v", err)
}
if body.Code != proto.ErrCodeSuccess {
log.LogWarnf("serveRequest: code[%v], msg[%v], data[%v] ", body.Code, body.Msg, body.Data)
if body.Code == proto.ErrCodeInternalError && len(body.Msg) > 0 {
return nil, errors.New(body.Msg)
}
return nil, proto.ParseErrorCode(body.Code)
}
return body.Bytes(), nil
default:
msg := fmt.Sprintf("serveRequest: unknown status: host(%v) uri(%v) status(%v) body(%s).",
resp.Request.URL.String(), host, stateCode, strings.Replace(string(repsData), "\n", "", -1))
err = errors.New(msg)
log.LogErrorf(msg)
continue
}
}
return
}
func (c *MasterClient) requestWith(rst interface{}, r *request) error {
if r.err != nil {
return r.err
}
buf, err := c.serveRequest(r)
if err != nil {
return err
}
if rst == nil {
return nil
}
return json.Unmarshal(buf, rst)
}
// request sends the request and discards the response payload (result target is nil).
func (c *MasterClient) request(r *request) error {
return c.requestWith(nil, r)
}
// Nodes returns all master addresses.
func (c *MasterClient) Nodes() (nodes []string) {
c.RLock()
nodes = c.masters
c.RUnlock()
return
}
// prepareRequest returns the leader address and all master addresses.
func (c *MasterClient) prepareRequest() (addr string, nodes []string) {
c.RLock()
addr = c.leaderAddr
nodes = c.masters
c.RUnlock()
return
}
func (c *MasterClient) httpRequest(method, url string, r *request) (resp *http.Response, err error) {
// Use a dedicated client so that setting the per-request timeout does not
// mutate the shared http.DefaultClient; a nil Transport still reuses
// http.DefaultTransport and its connection pool.
client := &http.Client{}
if !r.noTimeout {
client.Timeout = c.timeout
}
reader := bytes.NewReader(r.body)
var req *http.Request
fullUrl := c.mergeRequestUrl(url, r.params)
log.LogDebugf("httpRequest: method(%v) url(%v) bodyLength[%v].", method, fullUrl, len(r.body))
if req, err = http.NewRequest(method, fullUrl, reader); err != nil {
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Connection", "close")
for k, v := range r.header {
req.Header.Set(k, v)
}
resp, err = client.Do(req)
return
}
func (c *MasterClient) updateMaster(address string) {
contains := false
for _, master := range c.masters {
if master == address {
contains = true
break
}
}
if !contains {
c.masters = append(c.masters, address)
}
c.leaderAddr = address
}
func (c *MasterClient) mergeRequestUrl(url string, params map[string]string) string {
if len(params) > 0 {
buff := bytes.NewBuffer([]byte(url))
isFirstParam := true
for k, v := range params {
if isFirstParam {
buff.WriteString("?")
isFirstParam = false
} else {
buff.WriteString("&")
}
buff.WriteString(k)
buff.WriteString("=")
buff.WriteString(v)
}
return buff.String()
}
return url
}
func NewMasterCLientWithResolver(masters []string, useSSL bool, updateInverval int) *MasterCLientWithResolver {
mc := &MasterCLientWithResolver{
MasterClient: MasterClient{masters: masters, useSSL: useSSL, timeout: requestTimeout},
updateInverval: updateInverval,
stopC: make(chan struct{}),
}
mc.adminAPI = &AdminAPI{mc: &mc.MasterClient}
mc.clientAPI = &ClientAPI{mc: &mc.MasterClient}
mc.nodeAPI = &NodeAPI{mc: &mc.MasterClient}
mc.userAPI = &UserAPI{mc: &mc.MasterClient}
resolver, err := NewNameResolver(masters)
if err != nil {
return nil
}
mc.resolver = resolver
return mc
}
func (mc *MasterCLientWithResolver) Start() (err error) {
failed := true
for i := 0; i < 3; i++ {
var changed bool
changed, err = mc.resolver.Resolve()
if changed && err == nil {
var addrs []string
addrs, err = mc.resolver.GetAllAddresses()
if err == nil {
mc.ReplaceMasterAddresses(addrs)
failed = false
break
} else {
log.LogWarnf("MasterCLientWithResolver: Resolve failed: %v, retry %v", err, i)
}
}
}
if failed {
err = errors.New("MasterCLientWithResolver: Resolve failed")
log.LogErrorf("MasterCLientWithResolver: Resolve failed")
return
}
if len(mc.resolver.domains) == 0 {
log.LogDebugf("MasterCLientWithResolver: No domains found, skipping resolving timely")
return
}
go func() {
ticker := time.NewTicker(time.Duration(mc.updateInverval) * time.Minute)
// timer := time.NewTimer(0)
defer ticker.Stop()
for {
select {
case <-mc.stopC:
log.LogInfo("MasterCLientWithResolver goroutine stopped")
return
case <-ticker.C:
changed, err := mc.resolver.Resolve()
if changed && err == nil {
addrs, err := mc.resolver.GetAllAddresses()
if err == nil {
mc.ReplaceMasterAddresses(addrs)
}
}
// timer.Reset(time.Duration(mc.updateInverval) * time.Minute)
}
}
}()
return nil
}
func (mc *MasterCLientWithResolver) Stop() {
select {
case mc.stopC <- struct{}{}:
log.LogDebugf("stop resolver, notified!")
default:
log.LogDebugf("stop resolver, skipping notify!")
}
}
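// exampleResolverClient is an illustrative sketch (not part of this SDK): a
// resolver-backed client refreshes the master address list from DNS every
// updateInverval minutes until Stop is called. The domain and interval are
// placeholders.
func exampleResolverClient() (*MasterCLientWithResolver, error) {
	mc := NewMasterCLientWithResolver([]string{"master.example.com:17010"}, false, 5)
	if mc == nil {
		return nil, fmt.Errorf("name resolver initialization failed")
	}
	if err := mc.Start(); err != nil {
		return nil, err
	}
	return mc, nil
}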
// NewMasterClient returns a new MasterClient instance.
func NewMasterClient(masters []string, useSSL bool) *MasterClient {
mc := &MasterClient{masters: masters, useSSL: useSSL, timeout: requestTimeout}
mc.adminAPI = &AdminAPI{mc: mc}
mc.clientAPI = &ClientAPI{mc: mc}
mc.nodeAPI = &NodeAPI{mc: mc}
mc.userAPI = &UserAPI{mc: mc}
return mc
}
// NewMasterClientFromString parses a raw master address configuration
// string and returns a new MasterClient instance.
// Note that a valid raw string must match: "{HOST}:{PORT},{HOST}:{PORT}"
func NewMasterClientFromString(masterAddr string, useSSL bool) *MasterClient {
masters := make([]string, 0)
for _, master := range strings.Split(masterAddr, ",") {
master = strings.TrimSpace(master)
if master != "" {
masters = append(masters, master)
}
}
return NewMasterClient(masters, useSSL)
}
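// exampleClusterView is an illustrative sketch (not part of this SDK): build a
// client from a comma-separated master address string, set a request timeout,
// and fetch the cluster view through the AdminAPI. The addresses are
// placeholders.
func exampleClusterView() (*proto.ClusterView, error) {
	mc := NewMasterClientFromString("10.0.0.1:17010,10.0.0.2:17010", false)
	mc.SetTimeout(60) // request timeout in seconds
	return mc.AdminAPI().GetCluster()
}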
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"errors"
"fmt"
"math/rand"
"net"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
var domainRegexp = regexp.MustCompile(`^(?i)[a-z0-9-]+(\.[a-z0-9-]+)+\.?$`)
func IsValidDomain(domain string) bool {
return domainRegexp.MatchString(domain)
}
type IpCache struct {
sync.RWMutex
Ts int64 // time.Now().Unix()
Ips []string
}
func (ic *IpCache) SetIps(ips []string) {
ic.Lock()
defer ic.Unlock()
ic.Ips = ips
ic.Ts = time.Now().Unix()
}
func (ic *IpCache) UpdateTs() {
ic.Lock()
defer ic.Unlock()
ic.Ts = time.Now().Unix()
}
func (ic *IpCache) GetRandomIp() (ip string, err error) {
ic.RLock()
defer ic.RUnlock()
if len(ic.Ips) == 0 {
return "", fmt.Errorf("ip cache is empty")
}
randIndex := rand.Intn(len(ic.Ips))
return ic.Ips[randIndex], nil
}
func (ic *IpCache) GetAllIps() (ips []string, err error) {
ic.RLock()
defer ic.RUnlock()
if len(ic.Ips) == 0 {
return nil, fmt.Errorf("ip cache is empty")
}
return ic.Ips, nil
}
type NameResolver struct {
domains []string
ips []string
port uint64
ic *IpCache
}
// NewNameResolver parses a raw master address configuration
// string and returns a new NameResolver instance.
// Note that each member of addrPorts must match "IP:PORT" or "DOMAIN:PORT",
// and every PORT must be the same.
func NewNameResolver(addrPorts []string) (ns *NameResolver, err error) {
if len(addrPorts) == 0 {
log.LogErrorf("NameResolver: empty addresses for name resolver")
return nil, fmt.Errorf("empty addresses for name resolver")
}
var domains []string
var ips []string
port := uint64(0)
for _, ap := range addrPorts {
if ap == "" {
continue
}
arr := strings.Split(ap, ":")
/*if len(arr) != 2 {
return nil, fmt.Errorf("wrong addr format [%v]", ap)
}*/
arrNum := len(arr)
p := uint64(0)
if arrNum == 2 {
p, err = strconv.ParseUint(arr[1], 10, 64)
if err != nil {
log.LogErrorf("NameResolver: wrong addr format [%v]", ap)
return nil, fmt.Errorf("wrong addr format [%v]", ap)
}
} else if arrNum == 1 {
p = 80
} else {
log.LogErrorf("NameResolver: wrong addr format [%v]", ap)
return nil, fmt.Errorf("wrong addr format [%v]", ap)
}
if port == 0 {
port = p
} else if port != p {
log.LogErrorf("NameResolver: ports are not the same")
return nil, fmt.Errorf("ports are not the same")
}
addr := net.ParseIP(arr[0])
if addr == nil {
if IsValidDomain(arr[0]) {
domains = append(domains, arr[0])
} else {
log.LogErrorf("NameResolver: wrong addr format [%v]", ap)
return nil, fmt.Errorf("wrong addr format [%v]", ap)
}
} else {
ips = append(ips, addr.String())
}
}
ic := &IpCache{}
ns = &NameResolver{
domains: domains,
ips: ips,
port: port,
ic: ic,
}
log.LogDebugf("NameResolver: add ip[%v], domain[%v], port[%v]", ips, domains, port)
return ns, nil
}
func (ns *NameResolver) GetRandomIp() (ip string, err error) {
return ns.ic.GetRandomIp()
}
func (ns *NameResolver) GetAllIps() (ips []string, err error) {
return ns.ic.GetAllIps()
}
func (ns *NameResolver) GetAllAddresses() (addrs []string, err error) {
ips, err := ns.ic.GetAllIps()
if err != nil {
return nil, err
}
for _, ip := range ips {
addr := fmt.Sprintf("%s:%d", ip, ns.port)
addrs = append(addrs, addr)
}
return addrs, nil
}
func (ns *NameResolver) isChanged(ipSet map[string]struct{}) (changed bool) {
for _, ip := range ns.ic.Ips {
if _, ok := ipSet[ip]; !ok {
changed = true
}
}
if !changed {
if len(ipSet) != len(ns.ic.Ips) {
changed = true
}
}
return
}
func (ns *NameResolver) Resolve() (changed bool, err error) {
if len(ns.ips) == 0 && len(ns.domains) == 0 {
return false, fmt.Errorf("name or ip empty")
}
ipSet := make(map[string]struct{}, 0)
if len(ns.domains) > 0 {
var addrs []net.IP
for _, domain := range ns.domains {
addrs, err = net.LookupIP(domain)
if err != nil {
log.LogWarnf("domain [%v] resolved failed", domain)
continue
} else {
for _, ip := range addrs {
ipSet[ip.String()] = struct{}{}
}
}
}
}
for _, ip := range ns.ips {
ipSet[ip] = struct{}{}
}
if len(ipSet) == 0 {
return false, errors.New("resolve: resolving result is empty")
}
var ips []string
for ip := range ipSet {
ips = append(ips, ip)
}
changed = ns.isChanged(ipSet)
if changed {
log.LogInfof("Resolve: resolving result is changed from %v to %v", ns.ic.Ips, ips)
ns.ic.SetIps(ips)
} else {
log.LogDebugf("Resolve: resolving result is not changed %v", ns.ic.Ips)
}
ns.ic.UpdateTs()
return changed, nil
}
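// exampleResolveOnce is an illustrative sketch (not part of this SDK): resolve
// a mixed list of IPs and domains once and turn the cached result into
// "IP:PORT" master addresses. The domain and addresses are placeholders.
func exampleResolveOnce() ([]string, error) {
	ns, err := NewNameResolver([]string{"master.example.com:17010", "10.0.0.3:17010"})
	if err != nil {
		return nil, err
	}
	if _, err = ns.Resolve(); err != nil {
		return nil, err
	}
	return ns.GetAllAddresses()
}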
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
)
type request struct {
method string
path string
params map[string]string
header map[string]string
body []byte
err error
noTimeout bool
}
type anyParam struct {
key string
val interface{}
}
var ReqHeaderUA = fmt.Sprintf("cubefs-sdk/%v (commit %v)", proto.Version, proto.CommitID)
func (r *request) addParamAny(key string, value interface{}) *request {
r.params[key] = util.Any2String(value)
return r
}
func (r *request) addParam(key, value string) *request {
r.params[key] = value
return r
}
func (r *request) addHeader(key, value string) *request {
r.header[key] = value
return r
}
func (r *request) setBody(body []byte) *request {
r.body = body
return r
}
func (r *request) Param(params ...anyParam) *request {
for _, param := range params {
r.addParamAny(param.key, param.val)
}
return r
}
func (r *request) Header(headers map[string]string, added ...string) *request {
if len(added)%2 == 1 {
added = added[:len(added)-1]
}
for k, v := range headers {
r.header[k] = v
}
for idx := 0; idx < len(added); idx += 2 {
r.header[added[idx]] = added[idx+1]
}
return r
}
func (r *request) Body(body interface{}) *request {
reqBody, ok := body.([]byte)
if !ok {
var err error
if reqBody, err = json.Marshal(body); err != nil {
r.err = fmt.Errorf("body json marshal %s", err.Error())
return r
}
}
r.body = reqBody
return r
}
func (r *request) NoTimeout() *request {
r.noTimeout = true
return r
}
func newRequest(method string, path string) *request {
req := &request{
method: method,
path: path,
params: make(map[string]string),
header: make(map[string]string),
}
req.header["User-Agent"] = ReqHeaderUA
return req
}
func mergeHeader(headers map[string]string, added ...string) map[string]string {
if len(added)%2 == 1 {
added = added[:len(added)-1]
}
copied := make(map[string]string, len(headers)+len(added)/2)
for k, v := range headers {
copied[k] = v
}
for idx := 0; idx < len(added); idx += 2 {
copied[added[idx]] = added[idx+1]
}
return copied
}
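// exampleVolStatRequest is an illustrative sketch (not part of this SDK)
// showing how the fluent request builder above composes a GET request: the
// path comes from the proto package, and parameters and headers are chained
// before the API wrappers hand the request to serveRequest. The extra header
// key is a placeholder.
func exampleVolStatRequest(volName string) *request {
	return newRequest(get, proto.ClientVolStat).
		Header(map[string]string{"X-Example-Trace": "1"}).
		addParam("name", volName).
		addParamAny("version", proto.LFClient)
}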
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"errors"
"fmt"
syslog "log"
"math"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
// Low-level API, i.e. operations that work directly with inodes.
const (
OpenRetryInterval = 5 * time.Millisecond
OpenRetryLimit = 1000
maxUniqID = 5000
)
const (
BatchIgetRespBuf = 1000
MaxSummaryGoroutineNum = 120
BatchGetBufLen = 500
UpdateSummaryRetry = 3
SummaryKey = "DirStat"
ChannelLen = 100
BatchSize = 200
MaxGoroutineNum = 5
InodeFullMaxRetryTime = 2
ForceUpdateRWMP = "ForceUpdateRWMP"
)
func mapHaveSameKeys(m1, m2 map[uint32]*proto.MetaQuotaInfo) bool {
if len(m1) != len(m2) {
return false
}
for k := range m1 {
if _, ok := m2[k]; !ok {
return false
}
}
return true
}
func (mw *MetaWrapper) GetRootIno(subdir string) (uint64, error) {
rootIno, err := mw.LookupPath(subdir)
if err != nil {
return 0, fmt.Errorf("GetRootIno: Lookup failed, subdir(%v) err(%v)", subdir, err)
}
info, err := mw.InodeGet_ll(rootIno)
if err != nil {
return 0, fmt.Errorf("GetRootIno: InodeGet failed, subdir(%v) err(%v)", subdir, err)
}
if !proto.IsDir(info.Mode) {
return 0, fmt.Errorf("GetRootIno: not directory, subdir(%v) mode(%v) err(%v)", subdir, info.Mode, err)
}
syslog.Printf("GetRootIno: %v\n", rootIno)
return rootIno, nil
}
// LookupPath looks up the given absolute path and returns its inode number
func (mw *MetaWrapper) LookupPath(subdir string) (uint64, error) {
ino := proto.RootIno
if subdir == "" || subdir == "/" {
return ino, nil
}
dirs := strings.Split(subdir, "/")
for _, dir := range dirs {
if dir == "/" || dir == "" {
continue
}
child, _, err := mw.Lookup_ll(ino, dir)
if err != nil {
return 0, err
}
ino = child
}
return ino, nil
}
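// Illustrative sketch (not part of the original source): resolving a subdir
// path to its inode and then loading its attributes; "/a/b/c" is a
// hypothetical placeholder.
func exampleLookupSubdir(mw *MetaWrapper) (*proto.InodeInfo, error) {
	ino, err := mw.LookupPath("/a/b/c") // walks "a", "b", "c" starting from proto.RootIno
	if err != nil {
		return nil, err
	}
	return mw.InodeGet_ll(ino)
}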
func (mw *MetaWrapper) Statfs() (total, used, inodeCount uint64) {
total = atomic.LoadUint64(&mw.totalSize)
used = atomic.LoadUint64(&mw.usedSize)
inodeCount = atomic.LoadUint64(&mw.inodeCount)
return
}
func (mw *MetaWrapper) Create_ll(parentID uint64, name string, mode, uid, gid uint32, target []byte, fullPath string) (*proto.InodeInfo, error) {
// if mw.EnableTransaction {
txMask := proto.TxOpMaskOff
if proto.IsRegular(mode) {
txMask = proto.TxOpMaskCreate
} else if proto.IsDir(mode) {
txMask = proto.TxOpMaskMkdir
} else if proto.IsSymlink(mode) {
txMask = proto.TxOpMaskSymlink
} else {
txMask = proto.TxOpMaskMknod
}
txType := proto.TxMaskToType(txMask)
if mw.enableTx(txMask) && txType != proto.TxTypeUndefined {
return mw.txCreate_ll(parentID, name, mode, uid, gid, target, txType, fullPath)
} else {
return mw.create_ll(parentID, name, mode, uid, gid, target, fullPath)
}
}
func (mw *MetaWrapper) txCreate_ll(parentID uint64, name string, mode, uid, gid uint32, target []byte, txType uint32, fullPath string) (info *proto.InodeInfo, err error) {
var (
status int
// err error
// info *proto.InodeInfo
mp *MetaPartition
rwPartitions []*MetaPartition
)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("txCreate_ll: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
var quotaIds []uint32
if mw.EnableQuota {
quotaInfos, err := mw.getInodeQuota(parentMP, parentID)
if err != nil {
log.LogErrorf("Create_ll: get parent quota fail, parentID(%v) err(%v)", parentID, err)
return nil, syscall.ENOENT
}
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
}
rwPartitions = mw.getRWPartitions()
length := len(rwPartitions)
var tx *Transaction
defer func() {
if tx != nil {
err = tx.OnDone(err, mw)
}
}()
epoch := atomic.AddUint64(&mw.epoch, 1)
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
tx, err = NewCreateTransaction(parentMP, mp, parentID, name, mw.TxTimeout, txType)
if err != nil {
return nil, syscall.EAGAIN
}
status, info, err = mw.txIcreate(tx, mp, mode, uid, gid, target, quotaIds, fullPath)
if err == nil && status == statusOK {
goto create_dentry
} else if status == statusNoSpace {
log.LogErrorf("Create_ll status %v", status)
return nil, statusToErrno(status)
} else {
// sync cancel previous transaction before retry
tx.Rollback(mw)
}
}
return nil, syscall.ENOMEM
create_dentry:
if log.EnableDebug() {
log.LogDebugf("txCreate_ll: tx.txInfo(%v)", tx.txInfo)
}
status, err = mw.txDcreate(tx, parentMP, parentID, name, info.Inode, mode, quotaIds, fullPath)
if err != nil || status != statusOK {
return nil, statusErrToErrno(status, err)
}
if log.EnableDebug() {
log.LogDebugf("txCreate_ll: tx.txInfo(%v)", tx.txInfo)
}
if mw.EnableSummary {
var filesInc, dirsInc int64
if proto.IsDir(mode) {
dirsInc = 1
} else {
filesInc = 1
}
// go mw.UpdateSummary_ll(parentID, filesInc, dirsInc, 0)
job := func() {
mw.UpdateSummary_ll(parentID, filesInc, dirsInc, 0)
}
tx.SetOnCommit(job)
}
return info, nil
}
func (mw *MetaWrapper) create_ll(parentID uint64, name string, mode, uid, gid uint32, target []byte, fullPath string) (*proto.InodeInfo, error) {
var (
status int
err error
info *proto.InodeInfo
mp *MetaPartition
rwPartitions []*MetaPartition
)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("Create_ll: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
status, info, err = mw.iget(parentMP, parentID, mw.LastVerSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
quota := atomic.LoadUint32(&mw.DirChildrenNumLimit)
if info.Nlink >= quota {
log.LogErrorf("Create_ll: parent inode's nlink quota reached, parentID(%v)", parentID)
return nil, syscall.EDQUOT
}
get_rwmp:
rwPartitions = mw.getRWPartitions()
length := len(rwPartitions)
epoch := atomic.AddUint64(&mw.epoch, 1)
retryTime := 0
var quotaIds []uint32
if mw.EnableQuota {
quotaInfos, err := mw.getInodeQuota(parentMP, parentID)
if err != nil {
log.LogErrorf("Create_ll: get parent quota fail, parentID(%v) err(%v)", parentID, err)
return nil, syscall.ENOENT
}
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
status, info, err = mw.quotaIcreate(mp, mode, uid, gid, target, quotaIds, fullPath)
if err == nil && status == statusOK {
goto create_dentry
} else if status == statusFull {
if retryTime >= InodeFullMaxRetryTime {
break
}
retryTime++
log.LogWarnf("Mp(%v) inode is full, trigger rwmp get and retry(%v)", mp, retryTime)
mw.singleflight.Do(ForceUpdateRWMP, func() (interface{}, error) {
mw.triggerAndWaitForceUpdate()
return nil, nil
})
goto get_rwmp
} else if status == statusNoSpace {
log.LogErrorf("Create_ll status %v", status)
return nil, statusToErrno(status)
}
}
} else {
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
status, info, err = mw.icreate(mp, mode, uid, gid, target, fullPath)
if err == nil && status == statusOK {
goto create_dentry
} else if status == statusFull {
if retryTime >= InodeFullMaxRetryTime {
break
}
retryTime++
log.LogWarnf("Mp(%v) inode is full, trigger rwmp get and retry(%v)", mp, retryTime)
mw.singleflight.Do(ForceUpdateRWMP, func() (interface{}, error) {
mw.triggerAndWaitForceUpdate()
return nil, nil
})
goto get_rwmp
} else if status == statusNoSpace {
log.LogErrorf("Create_ll status %v", status)
return nil, statusToErrno(status)
}
}
}
return nil, syscall.ENOMEM
create_dentry:
if mw.EnableQuota {
status, err = mw.quotaDcreate(parentMP, parentID, name, info.Inode, mode, quotaIds, fullPath)
} else {
status, err = mw.dcreate(parentMP, parentID, name, info.Inode, mode, fullPath)
}
if err != nil {
if status == statusOpDirQuota || status == statusNoSpace {
mw.iunlink(mp, info.Inode, mw.Client.GetLatestVer(), 0, fullPath)
mw.ievict(mp, info.Inode, fullPath)
}
return nil, statusToErrno(status)
} else if status != statusOK {
if status != statusExist {
mw.iunlink(mp, info.Inode, mw.Client.GetLatestVer(), 0, fullPath)
mw.ievict(mp, info.Inode, fullPath)
}
return nil, statusToErrno(status)
}
if mw.EnableSummary {
var filesInc, dirsInc int64
if proto.IsDir(mode) {
dirsInc = 1
} else {
filesInc = 1
}
go mw.UpdateSummary_ll(parentID, filesInc, dirsInc, 0)
}
return info, nil
}
func (mw *MetaWrapper) Lookup_ll(parentID uint64, name string) (inode uint64, mode uint32, err error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("Lookup_ll: No parent partition, parentID(%v) name(%v)", parentID, name)
return 0, 0, syscall.ENOENT
}
status, inode, mode, err := mw.lookup(parentMP, parentID, name, mw.VerReadSeq)
if err != nil || status != statusOK {
return 0, 0, statusToErrno(status)
}
return inode, mode, nil
}
func (mw *MetaWrapper) BatchGetExpiredMultipart(prefix string, days int) (expiredIds []*proto.ExpiredMultipartInfo, err error) {
partitions := mw.partitions
var mp *MetaPartition
wg := new(sync.WaitGroup)
var resultMu sync.Mutex
log.LogDebugf("BatchGetExpiredMultipart: mp num(%v) prefix(%v) days(%v)", len(partitions), prefix, days)
for _, mp = range partitions {
wg.Add(1)
go func(mp *MetaPartition) {
defer wg.Done()
status, infos, err := mw.getExpiredMultipart(prefix, days, mp)
if err == nil && status == statusOK {
resultMu.Lock()
expiredIds = append(expiredIds, infos...)
resultMu.Unlock()
}
if err != nil && err != syscall.ENOENT {
log.LogErrorf("batchGetExpiredMultipart: get expired multipart fail: partitionId(%v)",
mp.PartitionID)
}
}(mp)
}
wg.Wait()
resultMu.Lock()
defer resultMu.Unlock()
if len(expiredIds) == 0 {
err = syscall.ENOENT
return
}
return
}
func (mw *MetaWrapper) InodeGet_ll(inode uint64) (*proto.InodeInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeGet_ll: No such partition, ino(%v)", inode)
return nil, syscall.ENOENT
}
status, info, err := mw.iget(mp, inode, mw.VerReadSeq)
if err != nil || status != statusOK {
if status == statusNoent {
// For NOENT error, pull the latest mp and give it another try,
// in case the mp view is outdated.
mw.triggerAndWaitForceUpdate()
return mw.doInodeGet(inode)
}
return nil, statusToErrno(status)
}
if mw.EnableQuota {
if len(info.QuotaInfos) != 0 && proto.IsDir(info.Mode) {
var qinfo QuotaCacheInfo
qinfo.quotaInfos = make(map[uint32]*proto.MetaQuotaInfo)
qinfo.quotaInfos = info.QuotaInfos
qinfo.inode = inode
mw.qc.Put(inode, &qinfo)
}
}
log.LogDebugf("InodeGet_ll: info(%v)", info)
return info, nil
}
// Just like InodeGet but without retry
func (mw *MetaWrapper) doInodeGet(inode uint64) (*proto.InodeInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeGet_ll: No such partition, ino(%v)", inode)
return nil, syscall.ENOENT
}
status, info, err := mw.iget(mp, inode, mw.VerReadSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
log.LogDebugf("doInodeGet: info(%v)", info)
return info, nil
}
func (mw *MetaWrapper) BatchInodeGet(inodes []uint64) []*proto.InodeInfo {
var wg sync.WaitGroup
batchInfos := make([]*proto.InodeInfo, 0)
resp := make(chan []*proto.InodeInfo, BatchIgetRespBuf)
candidates := make(map[uint64][]uint64)
// Target partition does not have to be very accurate.
for _, ino := range inodes {
mp := mw.getPartitionByInode(ino)
if mp == nil {
continue
}
if _, ok := candidates[mp.PartitionID]; !ok {
candidates[mp.PartitionID] = make([]uint64, 0, 256)
}
candidates[mp.PartitionID] = append(candidates[mp.PartitionID], ino)
}
for id, inos := range candidates {
mp := mw.getPartitionByID(id)
if mp == nil {
continue
}
wg.Add(1)
go mw.batchIget(&wg, mp, inos, resp)
}
go func() {
wg.Wait()
close(resp)
}()
for infos := range resp {
batchInfos = append(batchInfos, infos...)
}
log.LogDebugf("BatchInodeGet: inodesCnt(%d)", len(inodes))
return batchInfos
}
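// Illustrative sketch (not part of the original source): stat-ing every child
// of a directory with one batched call. BatchInodeGet groups the inodes by
// meta partition internally, so callers just pass the flat list.
func exampleStatChildren(mw *MetaWrapper, parentID uint64) ([]*proto.InodeInfo, error) {
	dentries, err := mw.ReadDir_ll(parentID)
	if err != nil {
		return nil, err
	}
	inodes := make([]uint64, 0, len(dentries))
	for _, den := range dentries {
		inodes = append(inodes, den.Inode)
	}
	// Inodes whose partition cannot be located are silently skipped by
	// BatchInodeGet, so the result may be shorter than the input.
	return mw.BatchInodeGet(inodes), nil
}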
// InodeDelete_ll is a low-level api that removes the specified inode immediately
// and does not affect the extent data managed by this inode.
func (mw *MetaWrapper) InodeDelete_ll(inode uint64, fullPath string) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeDelete: No such partition, ino(%v)", inode)
return syscall.ENOENT
}
status, err := mw.idelete(mp, inode, fullPath)
if err != nil || status != statusOK {
return statusToErrno(status)
}
log.LogDebugf("InodeDelete_ll: inode(%v)", inode)
return nil
}
func (mw *MetaWrapper) BatchGetXAttr(inodes []uint64, keys []string) ([]*proto.XAttrInfo, error) {
// Collect meta partitions
var (
mps = make(map[uint64]*MetaPartition) // Mapping: partition ID -> partition
mpInodes = make(map[uint64][]uint64) // Mapping: partition ID -> inodes
)
for _, ino := range inodes {
mp := mw.getPartitionByInode(ino)
if mp != nil {
mps[mp.PartitionID] = mp
mpInodes[mp.PartitionID] = append(mpInodes[mp.PartitionID], ino)
}
}
var (
xattrsCh = make(chan *proto.XAttrInfo, len(inodes))
errorsCh = make(chan error, len(inodes))
)
var wg sync.WaitGroup
for pID := range mps {
wg.Add(1)
go func(mp *MetaPartition, inodes []uint64, keys []string) {
defer wg.Done()
xattrs, err := mw.batchGetXAttr(mp, inodes, keys)
if err != nil {
errorsCh <- err
log.LogErrorf("BatchGetXAttr: get xattr fail: volume(%v) partitionID(%v) inodes(%v) keys(%v) err(%s)",
mw.volname, mp.PartitionID, inodes, keys, err)
return
}
for _, info := range xattrs {
xattrsCh <- info
}
}(mps[pID], mpInodes[pID], keys)
}
wg.Wait()
close(xattrsCh)
close(errorsCh)
if len(errorsCh) > 0 {
return nil, <-errorsCh
}
xattrs := make([]*proto.XAttrInfo, 0, len(inodes))
for {
info := <-xattrsCh
if info == nil {
break
}
xattrs = append(xattrs, info)
}
return xattrs, nil
}
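// Illustrative sketch (not part of the original source): reading a single
// xattr key for a set of inodes; "user.example" is a hypothetical key name.
func exampleBatchReadXAttr(mw *MetaWrapper, inodes []uint64) (map[uint64]string, error) {
	infos, err := mw.BatchGetXAttr(inodes, []string{"user.example"})
	if err != nil {
		return nil, err
	}
	values := make(map[uint64]string, len(infos))
	for _, info := range infos {
		values[info.Inode] = info.XAttrs["user.example"]
	}
	return values, nil
}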
func (mw *MetaWrapper) Delete_ll(parentID uint64, name string, isDir bool, fullPath string) (*proto.InodeInfo, error) {
if mw.enableTx(proto.TxOpMaskRemove) {
return mw.txDelete_ll(parentID, name, isDir, fullPath)
} else {
return mw.Delete_ll_EX(parentID, name, isDir, 0, fullPath)
}
}
func (mw *MetaWrapper) Delete_Ver_ll(parentID uint64, name string, isDir bool, verSeq uint64, fullPath string) (*proto.InodeInfo, error) {
if verSeq == 0 {
verSeq = math.MaxUint64
}
log.LogDebugf("Delete_Ver_ll.parentId %v name %v isDir %v verSeq %v", parentID, name, isDir, verSeq)
return mw.Delete_ll_EX(parentID, name, isDir, verSeq, fullPath)
}
func (mw *MetaWrapper) DeleteWithCond_ll(parentID, cond uint64, name string, isDir bool, fullPath string) (*proto.InodeInfo, error) {
return mw.deletewithcond_ll(parentID, cond, name, isDir, fullPath)
}
func (mw *MetaWrapper) txDelete_ll(parentID uint64, name string, isDir bool, fullPath string) (info *proto.InodeInfo, err error) {
var (
status int
inode uint64
mode uint32
mp *MetaPartition
)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("txDelete_ll: No parent partition, parentID(%v) name(%v)", parentID, name)
return nil, syscall.ENOENT
}
var tx *Transaction
defer func() {
if tx != nil {
err = tx.OnDone(err, mw)
}
}()
status, inode, mode, err = mw.lookup(parentMP, parentID, name, mw.LastVerSeq)
if err != nil || status != statusOK {
return nil, statusErrToErrno(status, err)
}
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("txDelete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, syscall.EINVAL
}
if isDir && !proto.IsDir(mode) {
return nil, syscall.EINVAL
}
if isDir && mw.EnableQuota {
quotaInfos, err := mw.GetInodeQuota_ll(inode)
if err != nil {
log.LogErrorf("get inode [%v] quota failed [%v]", inode, err)
return nil, syscall.ENOENT
}
for _, info := range quotaInfos {
if info.RootInode {
log.LogErrorf("can not remove quota Root inode equal inode [%v]", inode)
return nil, syscall.EACCES
}
}
}
tx, err = NewDeleteTransaction(parentMP, parentID, name, mp, inode, mw.TxTimeout)
if err != nil {
return nil, syscall.EAGAIN
}
status, err = mw.txCreateTX(tx, parentMP)
if status != statusOK || err != nil {
return nil, statusErrToErrno(status, err)
}
funcs := make([]func() (int, error), 0)
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, _, newErr = mw.txDdelete(tx, parentMP, parentID, inode, name, fullPath)
return newSt, newErr
})
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, info, newErr = mw.txIunlink(tx, mp, inode, fullPath)
return newSt, newErr
})
// 2. prepare transaction
var preErr error
wg := sync.WaitGroup{}
for _, fc := range funcs {
wg.Add(1)
go func(f func() (int, error)) {
defer wg.Done()
tStatus, tErr := f()
if tStatus != statusOK || tErr != nil {
preErr = statusErrToErrno(tStatus, tErr)
}
}(fc)
}
wg.Wait()
if preErr != nil {
return info, preErr
}
if mw.EnableSummary {
var job func()
// go func() {
if proto.IsDir(mode) {
job = func() {
mw.UpdateSummary_ll(parentID, 0, -1, 0)
}
} else {
job = func() {
mw.UpdateSummary_ll(parentID, -1, 0, -int64(info.Size))
}
}
tx.SetOnCommit(job)
}
return info, preErr
}
/*
 * Note that the returned InodeInfo might be nil even when there is no error,
 * so the caller should make sure the InodeInfo is valid before using it.
 */
func (mw *MetaWrapper) Delete_ll_EX(parentID uint64, name string, isDir bool, verSeq uint64, fullPath string) (*proto.InodeInfo, error) {
var (
status int
inode uint64
mode uint32
err error
info *proto.InodeInfo
mp *MetaPartition
inodeCreateTime int64
denVer uint64
)
log.LogDebugf("action[Delete_ll_EX] name %v verSeq %v parentID %v isDir %v", name, verSeq, parentID, isDir)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("delete_ll: No parent partition, parentID(%v) name(%v)", parentID, name)
return nil, syscall.ENOENT
}
if isDir {
status, inode, mode, err = mw.lookup(parentMP, parentID, name, verSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if !proto.IsDir(mode) {
return nil, syscall.EINVAL
}
if verSeq == 0 {
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("Delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, syscall.EAGAIN
}
status, info, err = mw.iget(mp, inode, verSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if info == nil || info.Nlink > 2 {
return nil, syscall.ENOTEMPTY
}
}
if mw.EnableQuota {
quotaInfos, err := mw.GetInodeQuota_ll(inode)
if err != nil {
log.LogErrorf("get inode [%v] quota failed [%v]", inode, err)
return nil, syscall.ENOENT
}
for _, info := range quotaInfos {
if info.RootInode {
log.LogErrorf("can not remove quota Root inode equal inode [%v]", inode)
return nil, syscall.EACCES
}
}
mw.qc.Delete(inode)
}
if mw.volDeleteLockTime > 0 {
inodeCreateTime = info.CreateTime.Unix()
if ok, err := mw.canDeleteInode(mp, info, inode); !ok {
return nil, err
}
}
} else {
if mw.volDeleteLockTime > 0 {
status, inode, _, err = mw.lookup(parentMP, parentID, name, verSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, syscall.EAGAIN
}
status, info, err = mw.iget(mp, inode, verSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
inodeCreateTime = info.CreateTime.Unix()
if ok, err := mw.canDeleteInode(mp, info, inode); !ok {
return nil, err
}
}
}
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
status, inode, _, err = mw.ddelete(parentMP, parentID, name, inodeCreateTime, verSeq, fullPath)
if err != nil || status != statusOK {
if status == statusNoent {
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
return nil, nil
}
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
return nil, statusToErrno(status)
}
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
// The dentry was deleted successfully but the inode was not; still return success.
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, nil
}
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
status, info, err = mw.iunlink(mp, inode, verSeq, denVer, fullPath)
if err != nil || status != statusOK {
log.LogDebugf("action[Delete_ll] parentID %v inode %v name %v verSeq %v err %v", parentID, inode, name, verSeq, err)
return nil, nil
}
if verSeq == 0 && mw.EnableSummary {
go func() {
if proto.IsDir(mode) {
mw.UpdateSummary_ll(parentID, 0, -1, 0)
} else {
mw.UpdateSummary_ll(parentID, -1, 0, -int64(info.Size))
}
}()
}
return info, nil
}
func isObjectLocked(mw *MetaWrapper, inode uint64, name string) error {
xattrInfo, err := mw.XAttrGet_ll(inode, "oss:lock")
if err != nil {
log.LogErrorf("isObjectLocked: check ObjectLock err(%v) name(%v)", err, name)
return err
}
retainUntilDate := xattrInfo.Get("oss:lock")
if len(retainUntilDate) > 0 {
retainUntilDateInt64, err := strconv.ParseInt(string(retainUntilDate), 10, 64)
if err != nil {
return err
}
if retainUntilDateInt64 > time.Now().UnixNano() {
log.LogWarnf("isObjectLocked: object is locked, retainUntilDate(%v) name(%v)", retainUntilDateInt64, name)
return errors.New("Access Denied")
}
}
return nil
}
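// Illustrative sketch (not part of the original source): isObjectLocked above
// reads "oss:lock" as a retain-until timestamp in nanoseconds, so a writer
// would store something like this before relying on the check.
func exampleSetObjectLock(mw *MetaWrapper, inode uint64, retainFor time.Duration) error {
	retainUntil := time.Now().Add(retainFor).UnixNano()
	return mw.XAttrSet_ll(inode, []byte("oss:lock"), []byte(strconv.FormatInt(retainUntil, 10)))
}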
func (mw *MetaWrapper) deletewithcond_ll(parentID, cond uint64, name string, isDir bool, fullPath string) (*proto.InodeInfo, error) {
err := isObjectLocked(mw, cond, name)
if err != nil {
return nil, err
}
var (
status int
inode uint64
mode uint32
info *proto.InodeInfo
mp *MetaPartition
)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("delete_ll: No parent partition, parentID(%v) name(%v)", parentID, name)
return nil, syscall.ENOENT
}
if isDir {
status, inode, mode, err = mw.lookup(parentMP, parentID, name, mw.LastVerSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if !proto.IsDir(mode) {
return nil, syscall.EINVAL
}
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, syscall.EAGAIN
}
status, info, err = mw.iget(mp, inode, mw.VerReadSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if info == nil || info.Nlink > 2 {
return nil, syscall.ENOTEMPTY
}
quotaInfos, err := mw.GetInodeQuota_ll(inode)
if err != nil {
log.LogErrorf("get inode [%v] quota failed [%v]", inode, err)
return nil, syscall.ENOENT
}
for _, info := range quotaInfos {
if info.RootInode {
log.LogErrorf("can not remove quota Root inode equal inode [%v]", inode)
return nil, syscall.EACCES
}
}
}
dentry := []proto.Dentry{
{
Name: name,
Inode: cond,
Type: mode,
},
}
status, resp, err := mw.ddeletes(parentMP, parentID, dentry, []string{fullPath})
if err != nil || status != statusOK {
if status == statusNoent {
return nil, nil
}
return nil, statusToErrno(status)
}
status = parseStatus(resp.Items[0].Status)
if status != statusOK {
if status == statusNoent {
return nil, nil
}
return nil, statusToErrno(status)
}
mp = mw.getPartitionByInode(resp.Items[0].Inode)
if mp == nil {
log.LogErrorf("delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, nil
}
status, info, err = mw.iunlink(mp, resp.Items[0].Inode, 0, 0, fullPath)
if err != nil || status != statusOK {
return nil, nil
}
if mw.EnableSummary {
go func() {
if proto.IsDir(mode) {
mw.UpdateSummary_ll(parentID, 0, -1, 0)
} else {
mw.UpdateSummary_ll(parentID, -1, 0, -int64(info.Size))
}
}()
}
return info, nil
}
func (mw *MetaWrapper) Rename_ll(srcParentID uint64, srcName string, dstParentID uint64, dstName string, srcFullPath string, dstFullPath string, overwritten bool) (err error) {
if mw.enableTx(proto.TxOpMaskRename) {
return mw.txRename_ll(srcParentID, srcName, dstParentID, dstName, srcFullPath, dstFullPath, overwritten)
} else {
return mw.rename_ll(srcParentID, srcName, dstParentID, dstName, srcFullPath, dstFullPath, overwritten)
}
}
func (mw *MetaWrapper) txRename_ll(srcParentID uint64, srcName string, dstParentID uint64, dstName string, srcFullPath string, dstFullPath string, overwritten bool) (err error) {
var tx *Transaction
defer func() {
if tx != nil {
err = tx.OnDone(err, mw)
}
}()
srcParentMP := mw.getPartitionByInode(srcParentID)
if srcParentMP == nil {
return syscall.ENOENT
}
dstParentMP := mw.getPartitionByInode(dstParentID)
if dstParentMP == nil {
return syscall.ENOENT
}
// look up for the src ino
status, srcInode, srcMode, err := mw.lookup(srcParentMP, srcParentID, srcName, mw.LastVerSeq)
if err != nil || status != statusOK {
return statusToErrno(status)
}
tx, err = NewRenameTransaction(srcParentMP, srcParentID, srcName, dstParentMP, dstParentID, dstName, mw.TxTimeout)
if err != nil {
return syscall.EAGAIN
}
funcs := make([]func() (int, error), 0)
status, dstInode, dstMode, err := mw.lookup(dstParentMP, dstParentID, dstName, mw.LastVerSeq)
if err == nil && status == statusOK {
// Note that only regular files are allowed to be overwritten.
if !proto.IsRegular(dstMode) || !overwritten || !proto.IsRegular(srcMode) {
return syscall.EEXIST
}
oldInodeMP := mw.getPartitionByInode(dstInode)
if oldInodeMP == nil {
return syscall.EAGAIN
}
err = RenameTxReplaceInode(tx, oldInodeMP, dstInode)
if err != nil {
return syscall.EAGAIN
}
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, _, newErr = mw.txDupdate(tx, dstParentMP, dstParentID, dstName, srcInode, dstInode, dstFullPath)
return newSt, newErr
})
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, _, newErr = mw.txIunlink(tx, oldInodeMP, dstInode, dstFullPath)
if newSt == statusNoent {
return statusOK, nil
}
return newSt, newErr
})
if log.EnableDebug() {
log.LogDebugf("txRename_ll: tx(%v), pid:%v, name:%v, old(ino:%v) is replaced by src(new ino:%v)",
tx.txInfo, dstParentID, dstName, dstInode, srcInode)
}
} else if status == statusNoent {
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, newErr = mw.txDcreate(tx, dstParentMP, dstParentID, dstName, srcInode, srcMode, []uint32{}, dstFullPath)
return newSt, newErr
})
} else {
return statusToErrno(status)
}
// var inode uint64
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, _, newErr = mw.txDdelete(tx, srcParentMP, srcParentID, srcInode, srcName, srcFullPath)
return newSt, newErr
})
if log.EnableDebug() {
log.LogDebugf("txRename_ll: tx(%v), pid:%v, name:%v, old(ino:%v) is replaced by src(new ino:%v)",
tx.txInfo, dstParentID, dstName, dstInode, srcInode)
}
// 1. create transaction
status, err = mw.txCreateTX(tx, dstParentMP)
if status != statusOK || err != nil {
return statusErrToErrno(status, err)
}
// 2. prepare transaction
var preErr error
wg := sync.WaitGroup{}
for _, fc := range funcs {
wg.Add(1)
go func(f func() (int, error)) {
defer wg.Done()
tStatus, tErr := f()
if tStatus != statusOK || tErr != nil {
preErr = statusErrToErrno(tStatus, tErr)
}
}(fc)
}
wg.Wait()
if preErr != nil {
return preErr
}
// update summary
var job func()
if mw.EnableSummary {
var srcInodeInfo *proto.InodeInfo
var dstInodeInfo *proto.InodeInfo
srcInodeInfo, _ = mw.InodeGet_ll(srcInode)
if dstInode != 0 {
dstInodeInfo, _ = mw.InodeGet_ll(dstInode)
sizeInc := srcInodeInfo.Size - dstInodeInfo.Size
job = func() {
mw.UpdateSummary_ll(srcParentID, -1, 0, -int64(srcInodeInfo.Size))
mw.UpdateSummary_ll(dstParentID, 0, 0, int64(sizeInc))
}
tx.SetOnCommit(job)
return
} else {
sizeInc := int64(srcInodeInfo.Size)
if proto.IsRegular(srcMode) {
log.LogDebugf("txRename_ll: update summary when file dentry is replaced")
job = func() {
mw.UpdateSummary_ll(srcParentID, -1, 0, -sizeInc)
mw.UpdateSummary_ll(dstParentID, 1, 0, sizeInc)
}
} else {
log.LogDebugf("txRename_ll: update summary when dir dentry is replaced")
job = func() {
mw.UpdateSummary_ll(srcParentID, 0, -1, 0)
mw.UpdateSummary_ll(dstParentID, 0, 1, 0)
}
}
tx.SetOnCommit(job)
}
}
// TODO
// job = func() {
// var inodes []uint64
// inodes = append(inodes, srcInode)
// srcQuotaInfos, err := mw.GetInodeQuota_ll(srcParentID)
// if err != nil {
// log.LogErrorf("rename_ll get src parent inode [%v] quota fail [%v]", srcParentID, err)
// }
// destQuotaInfos, err := mw.getInodeQuota(dstParentMP, dstParentID)
// if err != nil {
// log.LogErrorf("rename_ll: get dst partent inode [%v] quota fail [%v]", dstParentID, err)
// }
// if mapHaveSameKeys(srcQuotaInfos, destQuotaInfos) {
// return
// }
// for quotaId := range srcQuotaInfos {
// mw.BatchDeleteInodeQuota_ll(inodes, quotaId)
// }
// for quotaId, info := range destQuotaInfos {
// log.LogDebugf("BatchSetInodeQuota_ll inodes [%v] quotaId [%v] rootInode [%v]", inodes, quotaId, info.RootInode)
// mw.BatchSetInodeQuota_ll(inodes, quotaId, false)
// }
// }
// tx.SetOnCommit(job)
return nil
}
func (mw *MetaWrapper) rename_ll(srcParentID uint64, srcName string, dstParentID uint64, dstName string, srcFullPath string, dstFullPath string, overwritten bool) (err error) {
var (
oldInode uint64
lastVerSeq uint64
)
srcParentMP := mw.getPartitionByInode(srcParentID)
if srcParentMP == nil {
return syscall.ENOENT
}
dstParentMP := mw.getPartitionByInode(dstParentID)
if dstParentMP == nil {
return syscall.ENOENT
}
status, info, err := mw.iget(dstParentMP, dstParentID, mw.VerReadSeq)
if err != nil || status != statusOK {
return statusToErrno(status)
}
quota := atomic.LoadUint32(&mw.DirChildrenNumLimit)
if info.Nlink >= quota {
log.LogErrorf("rename_ll: dst parent inode's nlink quota reached, parentID(%v)", dstParentID)
return syscall.EDQUOT
}
// look up for the src ino
status, inode, mode, err := mw.lookup(srcParentMP, srcParentID, srcName, mw.VerReadSeq)
if err != nil || status != statusOK {
return statusToErrno(status)
}
srcMP := mw.getPartitionByInode(inode)
if srcMP == nil {
return syscall.ENOENT
}
status, _, err = mw.ilink(srcMP, inode, srcFullPath)
if err != nil || status != statusOK {
return statusToErrno(status)
}
// create dentry in dst parent
status, err = mw.dcreate(dstParentMP, dstParentID, dstName, inode, mode, dstFullPath)
if err != nil {
if status == statusOpDirQuota {
return statusToErrno(status)
}
return syscall.EAGAIN
}
var srcInodeInfo *proto.InodeInfo
var dstInodeInfo *proto.InodeInfo
if mw.EnableSummary {
srcInodeInfo, _ = mw.InodeGet_ll(inode)
}
// Note that only regular files are allowed to be overwritten.
if status == statusExist && (proto.IsSymlink(mode) || proto.IsRegular(mode)) {
if !overwritten {
return syscall.EEXIST
}
status, oldInode, err = mw.dupdate(dstParentMP, dstParentID, dstName, inode, dstFullPath)
if err != nil {
return syscall.EAGAIN
}
if mw.EnableSummary {
dstInodeInfo, _ = mw.InodeGet_ll(oldInode)
}
}
if status != statusOK {
mw.iunlink(srcMP, inode, lastVerSeq, 0, srcFullPath)
return statusToErrno(status)
}
var denVer uint64
// delete dentry from src parent
status, _, denVer, err = mw.ddelete(srcParentMP, srcParentID, srcName, 0, lastVerSeq, srcFullPath)
if err != nil {
log.LogErrorf("mw.ddelete(srcParentMP, srcParentID, %s) failed.", srcName)
return statusToErrno(status)
} else if status != statusOK {
var (
sts int
e error
)
if oldInode == 0 {
sts, inode, denVer, e = mw.ddelete(dstParentMP, dstParentID, dstName, 0, lastVerSeq, dstFullPath)
} else {
sts, denVer, e = mw.dupdate(dstParentMP, dstParentID, dstName, oldInode, dstFullPath)
}
if e == nil && sts == statusOK {
mw.iunlink(srcMP, inode, lastVerSeq, denVer, srcFullPath)
}
return statusToErrno(status)
}
mw.iunlink(srcMP, inode, lastVerSeq, denVer, srcFullPath)
if oldInode != 0 {
// overwritten
inodeMP := mw.getPartitionByInode(oldInode)
if inodeMP != nil {
mw.iunlink(inodeMP, oldInode, lastVerSeq, 0, dstFullPath)
// evict oldInode to avoid it becoming an orphan inode
mw.ievict(inodeMP, oldInode, dstFullPath)
}
if mw.EnableSummary {
sizeInc := srcInodeInfo.Size - dstInodeInfo.Size
go func() {
mw.UpdateSummary_ll(srcParentID, -1, 0, -int64(srcInodeInfo.Size))
mw.UpdateSummary_ll(dstParentID, 0, 0, int64(sizeInc))
}()
}
} else {
if mw.EnableSummary {
sizeInc := int64(srcInodeInfo.Size)
if proto.IsRegular(mode) {
// file
go func() {
mw.UpdateSummary_ll(srcParentID, -1, 0, -sizeInc)
mw.UpdateSummary_ll(dstParentID, 1, 0, sizeInc)
}()
} else {
// dir
go func() {
mw.UpdateSummary_ll(srcParentID, 0, -1, 0)
mw.UpdateSummary_ll(dstParentID, 0, 1, 0)
}()
}
}
}
// TODO
// var inodes []uint64
// inodes = append(inodes, inode)
// srcQuotaInfos, err := mw.GetInodeQuota_ll(srcParentID)
// if err != nil {
// log.LogErrorf("rename_ll get src parent inode [%v] quota fail [%v]", srcParentID, err)
// }
// destQuotaInfos, err := mw.getInodeQuota(dstParentMP, dstParentID)
// if err != nil {
// log.LogErrorf("rename_ll: get dst partent inode [%v] quota fail [%v]", dstParentID, err)
// }
// if mapHaveSameKeys(srcQuotaInfos, destQuotaInfos) {
// return nil
// }
// for quotaId := range srcQuotaInfos {
// mw.BatchDeleteInodeQuota_ll(inodes, quotaId)
// }
// for quotaId, info := range destQuotaInfos {
// log.LogDebugf("BatchSetInodeQuota_ll inodes [%v] quotaId [%v] rootInode [%v]", inodes, quotaId, info.RootInode)
// mw.BatchSetInodeQuota_ll(inodes, quotaId, false)
// }
return nil
}
// Read all dentries with parentID
func (mw *MetaWrapper) ReadDir_ll(parentID uint64) ([]proto.Dentry, error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return nil, syscall.ENOENT
}
status, children, err := mw.readDir(parentMP, parentID)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
return children, nil
}
// Read at most limit dentries under parentID (snapshot-clean variant), starting from the dentry named by from
func (mw *MetaWrapper) ReadDirLimitForSnapShotClean(parentID uint64, from string, limit uint64, verSeq uint64, idDir bool) ([]proto.Dentry, error) {
if verSeq == 0 {
verSeq = math.MaxUint64
}
log.LogDebugf("action[ReadDirLimit_ll] parentID %v from %v limit %v verSeq %v", parentID, from, limit, verSeq)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return nil, syscall.ENOENT
}
var opt uint8
opt |= uint8(proto.FlagsSnapshotDel)
if idDir {
opt |= uint8(proto.FlagsSnapshotDelDir)
}
status, children, err := mw.readDirLimit(parentMP, parentID, from, limit, verSeq, opt)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
for _, den := range children {
log.LogDebugf("ReadDirLimitForSnapShotClean. get dentry %v", den)
}
return children, nil
}
// Read at most limit dentries under parentID, starting from the dentry named by from
func (mw *MetaWrapper) ReadDirLimit_ll(parentID uint64, from string, limit uint64) ([]proto.Dentry, error) {
log.LogDebugf("action[ReadDirLimit_ll] parentID %v from %v limit %v", parentID, from, limit)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return nil, syscall.ENOENT
}
status, children, err := mw.readDirLimit(parentMP, parentID, from, limit, mw.VerReadSeq, 0)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
return children, nil
}
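// Illustrative sketch (not part of the original source): paging through a
// large directory with ReadDirLimit_ll. Whether the server repeats the
// boundary entry named by from at the start of the next page is an assumption
// here; the sketch tolerates either behavior.
func exampleReadDirPaged(mw *MetaWrapper, parentID uint64, pageSize uint64) ([]proto.Dentry, error) {
	var all []proto.Dentry
	from := ""
	for {
		page, err := mw.ReadDirLimit_ll(parentID, from, pageSize)
		if err != nil {
			return nil, err
		}
		for _, den := range page {
			if den.Name == from {
				continue // skip the duplicated boundary entry, if any
			}
			all = append(all, den)
		}
		if uint64(len(page)) < pageSize {
			return all, nil
		}
		from = page[len(page)-1].Name
	}
}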
func (mw *MetaWrapper) DentryCreate_ll(parentID uint64, name string, inode uint64, mode uint32, fullPath string) error {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return syscall.ENOENT
}
var err error
var status int
if status, err = mw.dcreate(parentMP, parentID, name, inode, mode, fullPath); err != nil || status != statusOK {
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) DentryUpdate_ll(parentID uint64, name string, inode uint64, fullPath string) (oldInode uint64, err error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
err = syscall.ENOENT
return
}
var status int
status, oldInode, err = mw.dupdate(parentMP, parentID, name, inode, fullPath)
if err != nil || status != statusOK {
err = statusToErrno(status)
return
}
return
}
func (mw *MetaWrapper) SplitExtentKey(parentInode, inode uint64, ek proto.ExtentKey) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return syscall.ENOENT
}
var oldInfo *proto.InodeInfo
if mw.EnableSummary {
oldInfo, _ = mw.InodeGet_ll(inode)
}
status, err := mw.appendExtentKey(mp, inode, ek, nil, true)
if err != nil || status != statusOK {
log.LogErrorf("SplitExtentKey: inode(%v) ek(%v) err(%v) status(%v)", inode, ek, err, status)
return statusToErrno(status)
}
log.LogDebugf("SplitExtentKey: ino(%v) ek(%v)", inode, ek)
if mw.EnableSummary {
go func() {
newInfo, _ := mw.InodeGet_ll(inode)
if oldInfo != nil && newInfo != nil {
if int64(oldInfo.Size) < int64(newInfo.Size) {
mw.UpdateSummary_ll(parentInode, 0, 0, int64(newInfo.Size)-int64(oldInfo.Size))
}
}
}()
}
return nil
}
// Used as a callback by stream sdk
func (mw *MetaWrapper) AppendExtentKey(parentInode, inode uint64, ek proto.ExtentKey, discard []proto.ExtentKey) (int, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return statusError, syscall.ENOENT
}
var oldInfo *proto.InodeInfo
if mw.EnableSummary {
oldInfo, _ = mw.InodeGet_ll(inode)
}
status, err := mw.appendExtentKey(mp, inode, ek, discard, false)
if err != nil || status != statusOK {
log.LogErrorf("MetaWrapper AppendExtentKey: inode(%v) ek(%v) local discard(%v) err(%v) status(%v)", inode, ek, discard, err, status)
return status, statusToErrno(status)
}
log.LogDebugf("MetaWrapper AppendExtentKey: ino(%v) ek(%v) discard(%v)", inode, ek, discard)
if mw.EnableSummary {
go func() {
newInfo, _ := mw.InodeGet_ll(inode)
if oldInfo != nil && newInfo != nil {
if int64(oldInfo.Size) < int64(newInfo.Size) {
mw.UpdateSummary_ll(parentInode, 0, 0, int64(newInfo.Size)-int64(oldInfo.Size))
}
}
}()
}
return statusOK, nil
}
// AppendExtentKeys appends multiple extent keys to the specified inode in a single request.
func (mw *MetaWrapper) AppendExtentKeys(inode uint64, eks []proto.ExtentKey) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return syscall.ENOENT
}
status, err := mw.appendExtentKeys(mp, inode, eks)
if err != nil || status != statusOK {
log.LogErrorf("AppendExtentKeys: inode(%v) extentKeys(%v) err(%v) status(%v)", inode, eks, err, status)
return statusToErrno(status)
}
log.LogDebugf("AppendExtentKeys: ino(%v) extentKeys(%v)", inode, eks)
return nil
}
// AppendObjExtentKeys appends multiple object extent keys to the specified inode in a single request.
func (mw *MetaWrapper) AppendObjExtentKeys(inode uint64, eks []proto.ObjExtentKey) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return syscall.ENOENT
}
status, err := mw.appendObjExtentKeys(mp, inode, eks)
if err != nil || status != statusOK {
log.LogErrorf("AppendObjExtentKeys: inode(%v) objextentKeys(%v) err(%v) status(%v)", inode, eks, err, status)
return statusToErrno(status)
}
log.LogDebugf("AppendObjExtentKeys: ino(%v) objextentKeys(%v)", inode, eks)
return nil
}
func (mw *MetaWrapper) GetExtents(inode uint64) (gen uint64, size uint64, extents []proto.ExtentKey, err error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return 0, 0, nil, syscall.ENOENT
}
resp, err := mw.getExtents(mp, inode)
if err != nil {
if resp != nil {
err = statusToErrno(resp.Status)
}
log.LogErrorf("GetExtents: ino(%v) err(%v)", inode, err)
return 0, 0, nil, err
}
extents = resp.Extents
gen = resp.Generation
size = resp.Size
// log.LogDebugf("GetObjExtents stack[%v]", string(debug.Stack()))
log.LogDebugf("GetExtents: ino(%v) gen(%v) size(%v) extents len (%v)", inode, gen, size, len(extents))
return gen, size, extents, nil
}
func (mw *MetaWrapper) GetObjExtents(inode uint64) (gen uint64, size uint64, extents []proto.ExtentKey, objExtents []proto.ObjExtentKey, err error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return 0, 0, nil, nil, syscall.ENOENT
}
status, gen, size, extents, objExtents, err := mw.getObjExtents(mp, inode)
if err != nil || status != statusOK {
log.LogErrorf("GetObjExtents: ino(%v) err(%v) status(%v)", inode, err, status)
return 0, 0, nil, nil, statusToErrno(status)
}
log.LogDebugf("GetObjExtents: ino(%v) gen(%v) size(%v) extents(%v) objextents(%v)", inode, gen, size, extents, objExtents)
return gen, size, extents, objExtents, nil
}
func (mw *MetaWrapper) Truncate(inode, size uint64, fullPath string) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("Truncate: No inode partition, ino(%v)", inode)
return syscall.ENOENT
}
status, err := mw.truncate(mp, inode, size, fullPath)
if err != nil || status != statusOK {
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) Link(parentID uint64, name string, ino uint64, fullPath string) (*proto.InodeInfo, error) {
// if mw.EnableTransaction {
if mw.EnableTransaction&proto.TxOpMaskLink > 0 {
return mw.txLink(parentID, name, ino, fullPath)
} else {
return mw.link(parentID, name, ino, fullPath)
}
}
func (mw *MetaWrapper) txLink(parentID uint64, name string, ino uint64, fullPath string) (info *proto.InodeInfo, err error) {
// var err error
var status int
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("txLink: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
mp := mw.getPartitionByInode(ino)
if mp == nil {
log.LogErrorf("txLink: No target inode partition, ino(%v)", ino)
return nil, syscall.ENOENT
}
var tx *Transaction
defer func() {
if tx != nil {
err = tx.OnDone(err, mw)
}
}()
tx, err = NewLinkTransaction(parentMP, parentID, name, mp, ino, mw.TxTimeout)
if err != nil {
return nil, syscall.EAGAIN
}
status, err = mw.txCreateTX(tx, parentMP)
if status != statusOK || err != nil {
return nil, statusErrToErrno(status, err)
}
funcs := make([]func() (int, error), 0)
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, info, newErr = mw.txIlink(tx, mp, ino, fullPath)
return newSt, newErr
})
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
var quotaIds []uint32
var ifo *proto.InodeInfo
if mw.EnableQuota {
quotaInfos, newErr := mw.getInodeQuota(parentMP, parentID)
if newErr != nil {
log.LogErrorf("link: get parent quota fail, parentID(%v) err(%v)", parentID, newErr)
return statusError, syscall.ENOENT
}
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
}
newSt, ifo, newErr = mw.iget(mp, ino, 0)
if newErr != nil || newSt != statusOK {
return newSt, newErr
}
newSt, newErr = mw.txDcreate(tx, parentMP, parentID, name, ino, ifo.Mode, quotaIds, fullPath)
return newSt, newErr
})
// 2. prepare transaction
var preErr error
wg := sync.WaitGroup{}
for _, fc := range funcs {
wg.Add(1)
go func(f func() (int, error)) {
defer wg.Done()
tStatus, tErr := f()
if tStatus != statusOK || tErr != nil {
preErr = statusErrToErrno(tStatus, tErr)
}
}(fc)
}
wg.Wait()
if preErr != nil {
return nil, preErr
}
return info, nil
}
func (mw *MetaWrapper) link(parentID uint64, name string, ino uint64, fullPath string) (*proto.InodeInfo, error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("Link: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
status, info, err := mw.iget(parentMP, parentID, mw.VerReadSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
quota := atomic.LoadUint32(&mw.DirChildrenNumLimit)
if info.Nlink >= quota {
log.LogErrorf("link: parent inode's nlink quota reached, parentID(%v)", parentID)
return nil, syscall.EDQUOT
}
mp := mw.getPartitionByInode(ino)
if mp == nil {
log.LogErrorf("Link: No target inode partition, ino(%v)", ino)
return nil, syscall.ENOENT
}
// increase inode nlink
status, info, err = mw.ilink(mp, ino, fullPath)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if mw.EnableQuota {
quotaInfos, err := mw.getInodeQuota(parentMP, parentID)
if err != nil {
log.LogErrorf("link: get parent quota fail, parentID(%v) err(%v)", parentID, err)
return nil, syscall.ENOENT
}
var quotaIds []uint32
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
// create new dentry and refer to the inode
status, err = mw.quotaDcreate(parentMP, parentID, name, ino, info.Mode, quotaIds, fullPath)
} else {
status, err = mw.dcreate(parentMP, parentID, name, ino, info.Mode, fullPath)
}
if err != nil {
return nil, statusToErrno(status)
} else if status != statusOK {
if status != statusExist {
mw.iunlink(mp, ino, mw.Client.GetLatestVer(), 0, fullPath)
}
return nil, statusToErrno(status)
}
return info, nil
}
func (mw *MetaWrapper) Evict(inode uint64, fullPath string) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogWarnf("Evict: No such partition, ino(%v)", inode)
return syscall.EINVAL
}
status, err := mw.ievict(mp, inode, fullPath)
if err != nil || status != statusOK {
log.LogWarnf("Evict: ino(%v) err(%v) status(%v)", inode, err, status)
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) Setattr(inode uint64, valid, mode, uid, gid uint32, atime, mtime int64) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("Setattr: No such partition, ino(%v)", inode)
return syscall.EINVAL
}
status, err := mw.setattr(mp, inode, valid, mode, uid, gid, atime, mtime)
if err != nil || status != statusOK {
log.LogErrorf("Setattr: ino(%v) err(%v) status(%v)", inode, err, status)
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) InodeCreate_ll(parentID uint64, mode, uid, gid uint32, target []byte, quotaIds []uint64, fullPath string) (*proto.InodeInfo, error) {
var (
status int
err error
info *proto.InodeInfo
mp *MetaPartition
rwPartitions []*MetaPartition
)
get_rwmp:
rwPartitions = mw.getRWPartitions()
length := len(rwPartitions)
epoch := atomic.AddUint64(&mw.epoch, 1)
retryTime := 0
if mw.EnableQuota && parentID != 0 {
var quotaIds []uint32
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("InodeCreate_ll: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
quotaInfos, err := mw.getInodeQuota(parentMP, parentID)
if err != nil {
log.LogErrorf("InodeCreate_ll: get parent quota fail, parentID(%v) err(%v)", parentID, err)
return nil, syscall.ENOENT
}
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
status, info, err = mw.quotaIcreate(mp, mode, uid, gid, target, quotaIds, fullPath)
if err == nil && status == statusOK {
return info, nil
} else if status == statusFull {
if retryTime >= InodeFullMaxRetryTime {
break
}
retryTime++
log.LogWarnf("Mp(%v) inode is full, trigger rwmp get and retry(%v)", mp, retryTime)
mw.singleflight.Do(ForceUpdateRWMP, func() (interface{}, error) {
mw.triggerAndWaitForceUpdate()
return nil, nil
})
goto get_rwmp
} else if status == statusNoSpace {
log.LogErrorf("InodeCreate_ll status %v", status)
return nil, statusToErrno(status)
}
}
} else {
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
status, info, err = mw.icreate(mp, mode, uid, gid, target, fullPath)
if err == nil && status == statusOK {
return info, nil
} else if status == statusFull {
if retryTime >= InodeFullMaxRetryTime {
break
}
retryTime++
log.LogWarnf("Mp(%v) inode is full, trigger rwmp get and retry(%v)", mp, retryTime)
mw.singleflight.Do(ForceUpdateRWMP, func() (interface{}, error) {
mw.triggerAndWaitForceUpdate()
return nil, nil
})
goto get_rwmp
} else if status == statusNoSpace {
log.LogErrorf("InodeCreate_ll status %v", status)
return nil, statusToErrno(status)
}
}
}
return nil, syscall.ENOMEM
}
// InodeLink_ll is a low-level api that increases the specified inode's link count by 1.
func (mw *MetaWrapper) InodeLink_ll(inode uint64, fullPath string) (*proto.InodeInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeLink_ll: No such partition, ino(%v)", inode)
return nil, syscall.EINVAL
}
status, info, err := mw.ilink(mp, inode, fullPath)
if err != nil || status != statusOK {
log.LogErrorf("InodeLink_ll: ino(%v) err(%v) status(%v)", inode, err, status)
return nil, statusToErrno(status)
}
return info, nil
}
// InodeUnlink_ll is a low-level api that decreases the specified inode's link count by 1.
func (mw *MetaWrapper) InodeUnlink_ll(inode uint64, fullPath string) (*proto.InodeInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeUnlink_ll: No such partition, ino(%v)", inode)
return nil, syscall.EINVAL
}
var ver uint64
if mw.Client != nil {
ver = mw.Client.GetLatestVer()
}
status, info, err := mw.iunlink(mp, inode, ver, 0, fullPath)
if err != nil || status != statusOK {
log.LogErrorf("InodeUnlink_ll: ino(%v) err(%v) status(%v)", inode, err, status)
return nil, statusToErrno(status)
}
return info, nil
}
func (mw *MetaWrapper) InodeClearPreloadCache_ll(inode uint64) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeClearPreloadCache_ll: No such partition, ino(%v)", inode)
return syscall.EINVAL
}
status, err := mw.iclearCache(mp, inode)
if err != nil || status != statusOK {
log.LogErrorf("InodeClearPreloadCache_ll: ino(%v) err(%v) status(%v)", inode, err, status)
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) InitMultipart_ll(path string, extend map[string]string) (multipartId string, err error) {
var (
status int
mp *MetaPartition
rwPartitions = mw.getRWPartitions()
length = len(rwPartitions)
)
if length <= 0 {
log.LogErrorf("InitMultipart: no writable partitions, path(%v)", path)
return "", syscall.ENOENT
}
epoch := atomic.AddUint64(&mw.epoch, 1)
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
log.LogDebugf("InitMultipart_ll: mp(%v), index(%v)", mp, index)
var sessionId string
status, sessionId, err = mw.createMultipart(mp, path, extend)
if err == nil && status == statusOK && len(sessionId) > 0 {
return sessionId, nil
} else {
log.LogErrorf("InitMultipart: create multipart id fail, path(%v), mp(%v), status(%v), err(%v)",
path, mp, status, err)
}
}
log.LogErrorf("InitMultipart: create multipart id fail, path(%v), status(%v), err(%v)", path, status, err)
if err != nil {
return "", err
} else {
return "", statusToErrno(status)
}
}
func (mw *MetaWrapper) GetMultipart_ll(path, multipartId string) (info *proto.MultipartInfo, err error) {
var (
mpId uint64
found bool
)
mpId, found = util.MultipartIDFromString(multipartId).PartitionID()
if !found {
log.LogDebugf("AddMultipartPart_ll: meta partition not found by multipart id, multipartId(%v), err(%v)", multipartId, err)
// If meta partition not found by multipart id, broadcast to all meta partitions to find it
info, _, err = mw.broadcastGetMultipart(path, multipartId)
return
}
mp := mw.getPartitionByID(mpId)
if mp == nil {
err = syscall.ENOENT
return
}
status, multipartInfo, err := mw.getMultipart(mp, path, multipartId)
if err != nil || status != statusOK {
log.LogErrorf("GetMultipartRequest: err(%v) status(%v)", err, status)
return nil, statusToErrno(status)
}
return multipartInfo, nil
}
func (mw *MetaWrapper) AddMultipartPart_ll(path, multipartId string, partId uint16, size uint64, md5 string,
inodeInfo *proto.InodeInfo) (oldInode uint64, updated bool, err error) {
var (
mpId uint64
found bool
)
mpId, found = util.MultipartIDFromString(multipartId).PartitionID()
if !found {
log.LogDebugf("AddMultipartPart_ll: meta partition not found by multipart id, multipartId(%v), err(%v)", multipartId, err)
// If meta partition not found by multipart id, broadcast to all meta partitions to find it
if _, mpId, err = mw.broadcastGetMultipart(path, multipartId); err != nil {
log.LogErrorf("AddMultipartPart_ll: broadcast get multipart fail: multipartId(%v) err(%v)", multipartId, err)
return
}
}
mp := mw.getPartitionByID(mpId)
if mp == nil {
log.LogWarnf("AddMultipartPart_ll: has no meta partition: multipartId(%v) mpId(%v)", multipartId, mpId)
err = syscall.ENOENT
return
}
status, oldInode, updated, err := mw.addMultipartPart(mp, path, multipartId, partId, size, md5, inodeInfo)
if err != nil || status != statusOK {
log.LogErrorf("AddMultipartPart_ll: err(%v) status(%v)", err, status)
return 0, false, statusToErrno(status)
}
return
}
func (mw *MetaWrapper) RemoveMultipart_ll(path, multipartID string) (err error) {
var (
mpId uint64
found bool
)
mpId, found = util.MultipartIDFromString(multipartID).PartitionID()
if !found {
log.LogDebugf("AddMultipartPart_ll: meta partition not found by multipart id, multipartId(%v), err(%v)", multipartID, err)
// If meta partition not found by multipart id, broadcast to all meta partitions to find it
if _, mpId, err = mw.broadcastGetMultipart(path, multipartID); err != nil {
return
}
}
mp := mw.getPartitionByID(mpId)
if mp == nil {
err = syscall.ENOENT
return
}
status, err := mw.removeMultipart(mp, path, multipartID)
if err != nil || status != statusOK {
log.LogErrorf(" RemoveMultipart_ll: partition remove multipart fail: "+
"volume(%v) partitionID(%v) multipartID(%v) err(%v) status(%v)",
mw.volname, mp.PartitionID, multipartID, err, status)
return statusToErrno(status)
}
return
}
func (mw *MetaWrapper) broadcastGetMultipart(path, multipartId string) (info *proto.MultipartInfo, mpID uint64, err error) {
log.LogInfof("broadcastGetMultipart: find meta partition broadcast multipartId(%v)", multipartId)
partitions := mw.partitions
var mp *MetaPartition
wg := new(sync.WaitGroup)
var resultMu sync.Mutex
for _, mp = range partitions {
wg.Add(1)
go func(mp *MetaPartition) {
defer wg.Done()
status, multipartInfo, err := mw.getMultipart(mp, path, multipartId)
if err == nil && status == statusOK && multipartInfo != nil && multipartInfo.ID == multipartId {
resultMu.Lock()
mpID = mp.PartitionID
info = multipartInfo
resultMu.Unlock()
}
if err != nil && err != syscall.ENOENT {
log.LogErrorf("broadcastGetMultipart: get multipart fail: partitionId(%v) multipartId(%v)",
mp.PartitionID, multipartId)
}
}(mp)
}
wg.Wait()
resultMu.Lock()
defer resultMu.Unlock()
if info == nil {
err = syscall.ENOENT
return
}
return
}
func (mw *MetaWrapper) ListMultipart_ll(prefix, delimiter, keyMarker string, multipartIdMarker string, maxUploads uint64) (sessionResponse []*proto.MultipartInfo, err error) {
partitions := mw.partitions
wg := sync.WaitGroup{}
wl := sync.Mutex{}
sessions := make([]*proto.MultipartInfo, 0)
for _, mp := range partitions {
wg.Add(1)
go func(mp *MetaPartition) {
defer wg.Done()
status, response, err := mw.listMultiparts(mp, prefix, delimiter, keyMarker, multipartIdMarker, maxUploads+1)
if err != nil || status != statusOK {
log.LogErrorf("ListMultipart: partition list multipart fail, partitionID(%v) err(%v) status(%v)",
mp.PartitionID, err, status)
err = statusToErrno(status)
return
}
wl.Lock()
defer wl.Unlock()
sessions = append(sessions, response.Multiparts...)
}(mp)
}
// combine sessions from each partition
wg.Wait()
// reorder sessions by path
sort.SliceStable(sessions, func(i, j int) bool {
return (sessions[i].Path < sessions[j].Path) || ((sessions[i].Path == sessions[j].Path) && (sessions[i].ID < sessions[j].ID))
})
return sessions, nil
}
func (mw *MetaWrapper) XAttrSet_ll(inode uint64, name, value []byte) error {
var err error
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrSet_ll: no such partition, inode(%v)", inode)
return syscall.ENOENT
}
var status int
status, err = mw.setXAttr(mp, inode, name, value)
if err != nil || status != statusOK {
return statusToErrno(status)
}
log.LogDebugf("XAttrSet_ll: set xattr: volume(%v) inode(%v) name(%v) value(%v) status(%v)",
mw.volname, inode, name, value, status)
return nil
}
func (mw *MetaWrapper) BatchSetXAttr_ll(inode uint64, attrs map[string]string) error {
var err error
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrSet_ll: no such partition, inode(%v)", inode)
return syscall.ENOENT
}
var status int
status, err = mw.batchSetXAttr(mp, inode, attrs)
if err != nil || status != statusOK {
return statusToErrno(status)
}
log.LogDebugf("BatchSetXAttr_ll: set xattr: volume(%v) inode(%v) attrs(%v) status(%v)",
mw.volname, inode, attrs, status)
return nil
}
func (mw *MetaWrapper) XAttrGetAll_ll(inode uint64) (*proto.XAttrInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrGetAll_ll: no such partition, ino(%v)", inode)
return nil, syscall.ENOENT
}
attrs, status, err := mw.getAllXAttr(mp, inode)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
xAttr := &proto.XAttrInfo{
Inode: inode,
XAttrs: attrs,
}
log.LogDebugf("XAttrGetAll_ll: volume(%v) inode(%v) attrs(%v)",
mw.volname, inode, attrs)
return xAttr, nil
}
func (mw *MetaWrapper) XAttrGet_ll(inode uint64, name string) (*proto.XAttrInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeGet_ll: no such partition, ino(%v)", inode)
return nil, syscall.ENOENT
}
value, status, err := mw.getXAttr(mp, inode, name)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
xAttrValues := make(map[string]string)
xAttrValues[name] = value
xAttr := &proto.XAttrInfo{
Inode: inode,
XAttrs: xAttrValues,
}
log.LogDebugf("XAttrGet_ll: get xattr: volume(%v) inode(%v) name(%v) value(%v)",
mw.volname, inode, name, value)
return xAttr, nil
}
// XAttrDel_ll is a low-level meta api that deletes specified xattr.
func (mw *MetaWrapper) XAttrDel_ll(inode uint64, name string) error {
var err error
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrDel_ll: no such partition, inode(%v)", inode)
return syscall.ENOENT
}
var status int
status, err = mw.removeXAttr(mp, inode, name)
if err != nil || status != statusOK {
return statusToErrno(status)
}
log.LogDebugf("XAttrDel_ll: remove xattr, inode(%v) name(%v) status(%v)", inode, name, status)
return nil
}
func (mw *MetaWrapper) XAttrsList_ll(inode uint64) ([]string, error) {
var err error
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrsList_ll: no such partition, inode(%v)", inode)
return nil, syscall.ENOENT
}
keys, status, err := mw.listXAttr(mp, inode)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
return keys, nil
}
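// Illustrative sketch (not part of the original source): a set/get/list/delete
// round trip over the xattr helpers above; "user.example" is a hypothetical key.
func exampleXAttrRoundTrip(mw *MetaWrapper, inode uint64) error {
	if err := mw.XAttrSet_ll(inode, []byte("user.example"), []byte("value")); err != nil {
		return err
	}
	info, err := mw.XAttrGet_ll(inode, "user.example")
	if err != nil {
		return err
	}
	log.LogDebugf("xattr round trip: inode(%v) value(%v)", inode, info.XAttrs["user.example"])
	keys, err := mw.XAttrsList_ll(inode)
	if err != nil {
		return err
	}
	log.LogDebugf("xattr round trip: inode(%v) keys(%v)", inode, keys)
	return mw.XAttrDel_ll(inode, "user.example")
}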
func (mw *MetaWrapper) UpdateSummary_ll(parentIno uint64, filesInc int64, dirsInc int64, bytesInc int64) {
if filesInc == 0 && dirsInc == 0 && bytesInc == 0 {
return
}
mp := mw.getPartitionByInode(parentIno)
if mp == nil {
log.LogErrorf("UpdateSummary_ll: no such partition, inode(%v)", parentIno)
return
}
for cnt := 0; cnt < UpdateSummaryRetry; cnt++ {
err := mw.updateXAttrs(mp, parentIno, filesInc, dirsInc, bytesInc)
if err == nil {
return
}
}
return
}
func (mw *MetaWrapper) ReadDirOnly_ll(parentID uint64) ([]proto.Dentry, error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return nil, syscall.ENOENT
}
status, children, err := mw.readdironly(parentMP, parentID)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
return children, nil
}
type SummaryInfo struct {
Files int64
Subdirs int64
Fbytes int64
}
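// GetSummary_ll returns the content summary (file count, subdir count and bytes) of the
// directory tree rooted at parentIno, using at most goroutineNum concurrent workers
// (clamped to [1, MaxSummaryGoroutineNum]). When EnableSummary is set it aggregates the
// pre-computed summary xattrs; otherwise it walks the tree and stats every file inode.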
func (mw *MetaWrapper) GetSummary_ll(parentIno uint64, goroutineNum int32) (SummaryInfo, error) {
if goroutineNum > MaxSummaryGoroutineNum {
goroutineNum = MaxSummaryGoroutineNum
}
if goroutineNum <= 0 {
goroutineNum = 1
}
var summaryInfo SummaryInfo
errCh := make(chan error)
var wg sync.WaitGroup
var currentGoroutineNum int32 = 0
if mw.EnableSummary {
inodeCh := make(chan uint64, ChannelLen)
wg.Add(1)
atomic.AddInt32(&currentGoroutineNum, 1)
inodeCh <- parentIno
go mw.getDentry(parentIno, inodeCh, errCh, &wg, &currentGoroutineNum, true, goroutineNum)
go func() {
wg.Wait()
close(inodeCh)
}()
go mw.getDirSummary(&summaryInfo, inodeCh, errCh)
for err := range errCh {
return SummaryInfo{0, 0, 0}, err
}
return summaryInfo, nil
} else {
summaryCh := make(chan SummaryInfo, ChannelLen)
wg.Add(1)
atomic.AddInt32(&currentGoroutineNum, 1)
go mw.getSummaryOrigin(parentIno, summaryCh, errCh, &wg, &currentGoroutineNum, true, goroutineNum)
go func() {
wg.Wait()
close(summaryCh)
}()
go func(summaryInfo *SummaryInfo) {
for summary := range summaryCh {
summaryInfo.Files = summaryInfo.Files + summary.Files
summaryInfo.Subdirs = summaryInfo.Subdirs + summary.Subdirs
summaryInfo.Fbytes = summaryInfo.Fbytes + summary.Fbytes
}
close(errCh)
}(&summaryInfo)
for err := range errCh {
return SummaryInfo{0, 0, 0}, err
}
return summaryInfo, nil
}
}
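// getDentry walks the subdirectories below parentIno and feeds every discovered dentry
// inode into inodeCh, spawning a new goroutine per subdirectory while fewer than
// goroutineNum workers are running and recursing synchronously otherwise.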
func (mw *MetaWrapper) getDentry(parentIno uint64, inodeCh chan<- uint64, errCh chan<- error, wg *sync.WaitGroup, currentGoroutineNum *int32, newGoroutine bool, goroutineNum int32) {
defer func() {
if newGoroutine {
atomic.AddInt32(currentGoroutineNum, -1)
wg.Done()
}
}()
entries, err := mw.ReadDirOnly_ll(parentIno)
if err != nil {
errCh <- err
return
}
for _, entry := range entries {
inodeCh <- entry.Inode
if atomic.LoadInt32(currentGoroutineNum) < goroutineNum {
wg.Add(1)
atomic.AddInt32(currentGoroutineNum, 1)
go mw.getDentry(entry.Inode, inodeCh, errCh, wg, currentGoroutineNum, true, goroutineNum)
} else {
mw.getDentry(entry.Inode, inodeCh, errCh, wg, currentGoroutineNum, false, goroutineNum)
}
}
}
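// getDirSummary drains inodeCh, fetching the summary xattr of the inodes in batches of
// BatchSize and accumulating the parsed files/subdirs/bytes values into summaryInfo;
// errch is closed once the channel is exhausted.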
func (mw *MetaWrapper) getDirSummary(summaryInfo *SummaryInfo, inodeCh <-chan uint64, errch chan<- error) {
var inodes []uint64
var keys []string
for inode := range inodeCh {
inodes = append(inodes, inode)
keys = append(keys, SummaryKey)
if len(inodes) < BatchSize {
continue
}
xattrInfos, err := mw.BatchGetXAttr(inodes, keys)
if err != nil {
errch <- err
return
}
inodes = inodes[0:0]
keys = keys[0:0]
for _, xattrInfo := range xattrInfos {
if xattrInfo.XAttrs[SummaryKey] != "" {
summaryList := strings.Split(xattrInfo.XAttrs[SummaryKey], ",")
files, _ := strconv.ParseInt(summaryList[0], 10, 64)
subdirs, _ := strconv.ParseInt(summaryList[1], 10, 64)
fbytes, _ := strconv.ParseInt(summaryList[2], 10, 64)
summaryInfo.Files += files
summaryInfo.Subdirs += subdirs
summaryInfo.Fbytes += fbytes
}
}
}
xattrInfos, err := mw.BatchGetXAttr(inodes, keys)
if err != nil {
errch <- err
return
}
for _, xattrInfo := range xattrInfos {
if xattrInfo.XAttrs[SummaryKey] != "" {
summaryList := strings.Split(xattrInfo.XAttrs[SummaryKey], ",")
files, _ := strconv.ParseInt(summaryList[0], 10, 64)
subdirs, _ := strconv.ParseInt(summaryList[1], 10, 64)
fbytes, _ := strconv.ParseInt(summaryList[2], 10, 64)
summaryInfo.Files += files
summaryInfo.Subdirs += subdirs
summaryInfo.Fbytes += fbytes
}
}
close(errch)
return
}
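// getSummaryOrigin computes the summary of parentIno without the summary xattr: it reads
// the directory, stats every file inode for its size, sends the per-directory result on
// summaryCh, and then recurses into each subdirectory, optionally in a new goroutine.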
func (mw *MetaWrapper) getSummaryOrigin(parentIno uint64, summaryCh chan<- SummaryInfo, errCh chan<- error, wg *sync.WaitGroup, currentGoroutineNum *int32, newGoroutine bool, goroutineNum int32) {
defer func() {
if newGoroutine {
atomic.AddInt32(currentGoroutineNum, -1)
wg.Done()
}
}()
var subdirsList []uint64
retSummaryInfo := SummaryInfo{
Files: 0,
Subdirs: 0,
Fbytes: 0,
}
children, err := mw.ReadDir_ll(parentIno)
if err != nil {
errCh <- err
return
}
for _, dentry := range children {
if proto.IsDir(dentry.Type) {
retSummaryInfo.Subdirs += 1
subdirsList = append(subdirsList, dentry.Inode)
} else {
fileInfo, err := mw.InodeGet_ll(dentry.Inode)
if err != nil {
errCh <- err
return
}
retSummaryInfo.Files += 1
retSummaryInfo.Fbytes += int64(fileInfo.Size)
}
}
summaryCh <- retSummaryInfo
for _, subdirIno := range subdirsList {
if atomic.LoadInt32(currentGoroutineNum) < goroutineNum {
wg.Add(1)
atomic.AddInt32(currentGoroutineNum, 1)
go mw.getSummaryOrigin(subdirIno, summaryCh, errCh, wg, currentGoroutineNum, true, goroutineNum)
} else {
mw.getSummaryOrigin(subdirIno, summaryCh, errCh, wg, currentGoroutineNum, false, goroutineNum)
}
}
}
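// RefreshSummary_ll recomputes the summary xattrs of the directory tree rooted at
// parentIno with at most goroutineNum concurrent workers (clamped to [1, MaxSummaryGoroutineNum]).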
func (mw *MetaWrapper) RefreshSummary_ll(parentIno uint64, goroutineNum int32) error {
if goroutineNum > MaxSummaryGoroutineNum {
goroutineNum = MaxSummaryGoroutineNum
}
if goroutineNum <= 0 {
goroutineNum = 1
}
var wg sync.WaitGroup
var currentGoroutineNum int32 = 0
errch := make(chan error)
wg.Add(1)
atomic.AddInt32(&currentGoroutineNum, 1)
go mw.refreshSummary(parentIno, errch, &wg, &currentGoroutineNum, true, goroutineNum)
go func() {
wg.Wait()
close(errch)
}()
for err := range errch {
return err
}
return nil
}
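// refreshSummary compares the stored summary xattr of parentIno with a freshly computed
// one, asynchronously applies the difference via UpdateSummary_ll, and then recurses into
// every subdirectory.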
func (mw *MetaWrapper) refreshSummary(parentIno uint64, errCh chan<- error, wg *sync.WaitGroup, currentGoroutineNum *int32, newGoroutine bool, goroutineNum int32) {
defer func() {
if newGoroutine {
atomic.AddInt32(currentGoroutineNum, -1)
wg.Done()
}
}()
summaryXAttrInfo, err := mw.XAttrGet_ll(parentIno, SummaryKey)
if err != nil {
errCh <- err
return
}
oldSummaryInfo := SummaryInfo{0, 0, 0}
if summaryXAttrInfo.XAttrs[SummaryKey] != "" {
summaryList := strings.Split(summaryXAttrInfo.XAttrs[SummaryKey], ",")
files, _ := strconv.ParseInt(summaryList[0], 10, 64)
subdirs, _ := strconv.ParseInt(summaryList[1], 10, 64)
fbytes, _ := strconv.ParseInt(summaryList[2], 10, 64)
oldSummaryInfo = SummaryInfo{
Files: files,
Subdirs: subdirs,
Fbytes: fbytes,
}
} else {
oldSummaryInfo = SummaryInfo{0, 0, 0}
}
newSummaryInfo := SummaryInfo{0, 0, 0}
var subdirsList []uint64
children, err := mw.ReadDir_ll(parentIno)
if err != nil {
errCh <- err
return
}
for _, dentry := range children {
if proto.IsDir(dentry.Type) {
newSummaryInfo.Subdirs += 1
subdirsList = append(subdirsList, dentry.Inode)
} else {
fileInfo, err := mw.InodeGet_ll(dentry.Inode)
if err != nil {
errCh <- err
return
}
newSummaryInfo.Files += 1
newSummaryInfo.Fbytes += int64(fileInfo.Size)
}
}
go mw.UpdateSummary_ll(
parentIno,
newSummaryInfo.Files-oldSummaryInfo.Files,
newSummaryInfo.Subdirs-oldSummaryInfo.Subdirs,
newSummaryInfo.Fbytes-oldSummaryInfo.Fbytes,
)
for _, subdirIno := range subdirsList {
if atomic.LoadInt32(currentGoroutineNum) < goroutineNum {
wg.Add(1)
atomic.AddInt32(currentGoroutineNum, 1)
go mw.refreshSummary(subdirIno, errCh, wg, currentGoroutineNum, true, goroutineNum)
} else {
mw.refreshSummary(subdirIno, errCh, wg, currentGoroutineNum, false, goroutineNum)
}
}
}
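// BatchSetInodeQuota_ll groups the given inodes by meta partition and sets quotaId on them
// batch by batch; the returned map records the per-inode result code.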
func (mw *MetaWrapper) BatchSetInodeQuota_ll(inodes []uint64, quotaId uint32, IsRoot bool) (ret map[uint64]uint8, err error) {
batchInodeMap := make(map[uint64][]uint64)
ret = make(map[uint64]uint8, 0)
for _, ino := range inodes {
mp := mw.getPartitionByInode(ino)
if mp == nil {
continue
}
if _, isFind := batchInodeMap[mp.PartitionID]; !isFind {
batchInodeMap[mp.PartitionID] = make([]uint64, 0, 128)
}
batchInodeMap[mp.PartitionID] = append(batchInodeMap[mp.PartitionID], ino)
}
for id, inos := range batchInodeMap {
mp := mw.getPartitionByID(id)
resp, err := mw.batchSetInodeQuota(mp, inos, quotaId, IsRoot)
if err != nil {
log.LogErrorf("batchSetInodeQuota quota [%v] inodes [%v] err [%v]", quotaId, inos, err)
return ret, err
}
for k, v := range resp.InodeRes {
ret[k] = v
}
}
log.LogInfof("set subInode quota [%v] inodes [%v] ret [%v] success.", quotaId, inodes, ret)
return
}
func (mw *MetaWrapper) GetPartitionByInodeId_ll(inodeId uint64) (mp *MetaPartition) {
return mw.getPartitionByInode(inodeId)
}
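// BatchDeleteInodeQuota_ll groups the given inodes by meta partition and removes quotaId
// from them batch by batch; the returned map records the per-inode result code.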
func (mw *MetaWrapper) BatchDeleteInodeQuota_ll(inodes []uint64, quotaId uint32) (ret map[uint64]uint8, err error) {
batchInodeMap := make(map[uint64][]uint64)
ret = make(map[uint64]uint8, 0)
for _, ino := range inodes {
mp := mw.getPartitionByInode(ino)
if mp == nil {
continue
}
if _, isFind := batchInodeMap[mp.PartitionID]; !isFind {
batchInodeMap[mp.PartitionID] = make([]uint64, 0, 128)
}
batchInodeMap[mp.PartitionID] = append(batchInodeMap[mp.PartitionID], ino)
}
for id, inos := range batchInodeMap {
mp := mw.getPartitionByID(id)
resp, err := mw.batchDeleteInodeQuota(mp, inos, quotaId)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota quota [%v] inodes [%v] err [%v]", quotaId, inos, err)
return ret, err
}
for k, v := range resp.InodeRes {
ret[k] = v
}
}
log.LogInfof("delete subInode inodes [%v] quota [%v] ret [%v] success.", inodes, quotaId, ret)
return
}
func (mw *MetaWrapper) GetInodeQuota_ll(inode uint64) (quotaInfos map[uint32]*proto.MetaQuotaInfo, err error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
err = fmt.Errorf("get partition by inode [%v] failed", inode)
return nil, err
}
quotaInfos, err = mw.getInodeQuota(mp, inode)
if err != nil {
log.LogErrorf("GetInodeQuota_ll get inode [%v] quota failed [%v]", inode, err)
return
}
return
}
func (mw *MetaWrapper) ApplyQuota_ll(parentIno uint64, quotaId uint32, maxConcurrencyInode uint64) (numInodes uint64, err error) {
inodes := make([]uint64, 0, maxConcurrencyInode)
var curInodeCount uint64
err = mw.applyQuota(parentIno, quotaId, &numInodes, &curInodeCount, &inodes, maxConcurrencyInode, true)
return
}
func (mw *MetaWrapper) RevokeQuota_ll(parentIno uint64, quotaId uint32, maxConcurrencyInode uint64) (numInodes uint64, err error) {
inodes := make([]uint64, 0, maxConcurrencyInode)
var curInodeCount uint64
err = mw.revokeQuota(parentIno, quotaId, &numInodes, &curInodeCount, &inodes, maxConcurrencyInode, true)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"fmt"
"net"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
SendRetryLimit = 200 // times
SendRetryInterval = 100 // ms
)
type MetaConn struct {
conn *net.TCPConn
id uint64 // PartitionID
addr string // MetaNode addr
}
// Connection management
//
func (mc *MetaConn) String() string {
return fmt.Sprintf("partitionID(%v) addr(%v)", mc.id, mc.addr)
}
func (mw *MetaWrapper) getConn(partitionID uint64, addr string) (*MetaConn, error) {
conn, err := mw.conns.GetConnect(addr)
if err != nil {
return nil, err
}
mc := &MetaConn{conn: conn, id: partitionID, addr: addr}
return mc, nil
}
func (mw *MetaWrapper) putConn(mc *MetaConn, err error) {
mw.conns.PutConnect(mc.conn, err != nil)
}
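// sendToMetaPartition sends req to the partition leader first and, on failure, retries the
// request against every member of the partition with a growing interval until it succeeds
// or the send time limit derived from metaSendTimeout is exceeded.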
func (mw *MetaWrapper) sendToMetaPartition(mp *MetaPartition, req *proto.Packet) (*proto.Packet, error) {
var (
resp *proto.Packet
err error
addr string
mc *MetaConn
start time.Time
lastSeq uint64
)
var sendTimeLimit int
if mw.metaSendTimeout < 20 {
sendTimeLimit = 20 * 1000 // ms
} else {
sendTimeLimit = int(mw.metaSendTimeout) * 1000 // ms
}
delta := (sendTimeLimit*2/SendRetryLimit - SendRetryInterval*2) / SendRetryLimit // ms
log.LogDebugf("mw.metaSendTimeout: %v s, sendTimeLimit: %v ms, delta: %v ms, req %v", mw.metaSendTimeout, sendTimeLimit, delta, req)
req.ExtentType |= proto.MultiVersionFlag
errs := make(map[int]error, len(mp.Members))
var j int
addr = mp.LeaderAddr
if addr == "" {
err = errors.New(fmt.Sprintf("sendToMetaPartition: failed due to empty leader addr and goto retry, req(%v) mp(%v)", req, mp))
goto retry
}
mc, err = mw.getConn(mp.PartitionID, addr)
if err != nil {
log.LogWarnf("sendToMetaPartition: getConn failed and goto retry, req(%v) mp(%v) addr(%v) err(%v)", req, mp, addr, err)
goto retry
}
if mw.Client != nil { // compatible with lcNode, which does not initialize Client
lastSeq = mw.Client.GetLatestVer()
}
sendWithList:
resp, err = mc.send(req, lastSeq)
if err == nil && !resp.ShouldRetry() && !resp.ShouldRetryWithVersionList() {
mw.putConn(mc, err)
goto out
}
if resp != nil && resp.ShouldRetryWithVersionList() {
// already sent with the version list, so an issue must have occurred
if req.ExtentType&proto.VersionListFlag == proto.VersionListFlag {
mw.putConn(mc, err)
goto out
}
req.ExtentType |= proto.VersionListFlag
req.VerList = make([]*proto.VolVersionInfo, len(mw.Client.GetVerMgr().VerList))
copy(req.VerList, mw.Client.GetVerMgr().VerList)
log.LogWarnf("sendToMetaPartition: leader failed and goto retry, req(%v) mp(%v) mc(%v) err(%v) resp(%v)", req, mp, mc, err, resp)
goto sendWithList
}
mw.putConn(mc, err)
retry:
start = time.Now()
for i := 0; i <= SendRetryLimit; i++ {
for j, addr = range mp.Members {
mc, err = mw.getConn(mp.PartitionID, addr)
errs[j] = err
if err != nil {
log.LogWarnf("sendToMetaPartition: getConn failed and continue to retry, req(%v) mp(%v) addr(%v) err(%v)", req, mp, addr, err)
continue
}
resp, err = mc.send(req, lastSeq)
mw.putConn(mc, err)
if err == nil && !resp.ShouldRetry() {
goto out
}
if err == nil {
errs[j] = errors.New(fmt.Sprintf("request should retry[%v]", resp.GetResultMsg()))
} else {
errs[j] = err
}
log.LogWarnf("sendToMetaPartition: retry failed req(%v) mp(%v) mc(%v) errs(%v) resp(%v)", req, mp, mc, errs, resp)
}
if time.Since(start) > time.Duration(sendTimeLimit)*time.Millisecond {
log.LogWarnf("sendToMetaPartition: retry timeout req(%v) mp(%v) time(%v)", req, mp, time.Since(start))
break
}
sendRetryInterval := time.Duration(SendRetryInterval+i*delta) * time.Millisecond
log.LogWarnf("sendToMetaPartition: req(%v) mp(%v) retry in (%v), retry_iteration (%v), retry_totalTime (%v)", req, mp,
sendRetryInterval, i+1, time.Since(start))
time.Sleep(sendRetryInterval)
}
out:
log.LogDebugf("sendToMetaPartition: succeed! req(%v) mc(%v) resp(%v)", req, mc, resp)
if mw.Client != nil && resp != nil { // for compatibility with LcNode, check that the client is not nil
mw.checkVerFromMeta(resp)
}
if err != nil || resp == nil {
return nil, errors.New(fmt.Sprintf("sendToMetaPartition failed: req(%v) mp(%v) errs(%v) resp(%v)", req, mp, errs, resp))
}
return resp, nil
}
func (mc *MetaConn) send(req *proto.Packet, verSeq uint64) (resp *proto.Packet, err error) {
req.ExtentType |= proto.MultiVersionFlag
req.VerSeq = verSeq
err = req.WriteToConn(mc.conn)
if err != nil {
return nil, errors.Trace(err, "Failed to write to conn, req(%v)", req)
}
resp = proto.NewPacket()
err = resp.ReadFromConnWithVer(mc.conn, proto.ReadDeadlineTime)
if err != nil {
return nil, errors.Trace(err, "Failed to read from conn, req(%v)", req)
}
// Check if the ID and OpCode of the response are consistent with the request.
if resp.ReqID != req.ReqID || resp.Opcode != req.Opcode {
log.LogErrorf("send: the response packet mismatch with request: conn(%v to %v) req(%v) resp(%v)",
mc.conn.LocalAddr(), mc.conn.RemoteAddr(), req, resp)
return nil, syscall.EBADMSG
}
return resp, nil
}
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package meta
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
func FuzzNewMeta(data []byte) int {
f := fuzz.NewConsumer(data)
config := MetaConfig{}
err := f.GenerateStruct(&config)
if err != nil {
return 0
}
_, err = NewMetaWrapper(&config)
if err != nil {
return 0
}
return 1
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
gerrors "errors"
"fmt"
"sync"
"syscall"
"time"
"golang.org/x/sync/singleflight"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
authSDK "github.com/cubefs/cubefs/sdk/auth"
"github.com/cubefs/cubefs/sdk/data/wrapper"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/auth"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
HostsSeparator = ","
RefreshMetaPartitionsInterval = time.Minute * 5
)
const (
statusUnknown int = iota
statusOK
statusExist
statusNoent
statusFull
statusAgain
statusError
statusInval
statusNotPerm
StatusConflictExtents
statusOpDirQuota
statusNoSpace
statusTxInodeInfoNotExist
statusTxConflict
statusTxTimeout
statusUploadPartConflict
statusNotEmpty
)
const (
MaxMountRetryLimit = 6
MountRetryInterval = time.Second * 5
/*
* Minimum interval of forceUpdateMetaPartitions in seconds,
* i.e. only one force update request is allowed every 5 sec.
*/
MinForceUpdateMetaPartitionsInterval = 5
DefaultQuotaExpiration = 120 * time.Second
MaxQuotaCache = 10000
)
type AsyncTaskErrorFunc func(err error)
func (f AsyncTaskErrorFunc) OnError(err error) {
if f != nil {
f(err)
}
}
type MetaConfig struct {
Volume string
Owner string
Masters []string
Authenticate bool
TicketMess auth.TicketMess
ValidateOwner bool
OnAsyncTaskError AsyncTaskErrorFunc
EnableSummary bool
MetaSendTimeout int64
// EnableTransaction uint8
// EnableTransaction bool
VerReadSeq uint64
}
type MetaWrapper struct {
sync.RWMutex
cluster string
localIP string
volname string
ossSecure *OSSSecure
volCreateTime int64
volDeleteLockTime int64
owner string
ownerValidation bool
mc *masterSDK.MasterClient
ac *authSDK.AuthClient
conns *util.ConnectPool
// Callback for handling asynchronous task errors.
onAsyncTaskError AsyncTaskErrorFunc
// Partitions and ranges should be modified together. So do not
// use partitions and ranges directly. Use the helper functions instead.
// Partition map indexed by ID
partitions map[uint64]*MetaPartition
// Partition tree indexed by Start, in order to find a partition in which
// a specific inode is located.
ranges *btree.BTree
rwPartitions []*MetaPartition
epoch uint64
totalSize uint64
usedSize uint64
inodeCount uint64
authenticate bool
Ticket auth.Ticket
accessToken proto.APIAccessReq
sessionKey string
ticketMess auth.TicketMess
closeCh chan struct{}
closeOnce sync.Once
// Allocated to signal the goroutines that are waiting for a partition view update
partMutex sync.Mutex
partCond *sync.Cond
// Allocated to trigger and throttle instant partition updates
forceUpdate chan struct{}
forceUpdateLimit *rate.Limiter
singleflight singleflight.Group
EnableSummary bool
metaSendTimeout int64
DirChildrenNumLimit uint32
EnableTransaction proto.TxOpMask
TxTimeout int64
TxConflictRetryNum int64
TxConflictRetryInterval int64
EnableQuota bool
QuotaInfoMap map[uint32]*proto.QuotaInfo
QuotaLock sync.RWMutex
// uniqidRange for request dedup
uniqidRangeMap map[uint64]*uniqidRange
uniqidRangeMutex sync.Mutex
qc *QuotaCache
VerReadSeq uint64
LastVerSeq uint64
Client wrapper.SimpleClientInfo
}
type uniqidRange struct {
cur uint64
end uint64
}
// Ticket is the ticket issued by the authnode.
type Ticket struct {
ID string `json:"client_id"`
SessionKey string `json:"session_key"`
ServiceID string `json:"service_id"`
Ticket string `json:"ticket"`
}
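// NewMetaWrapper creates a MetaWrapper for the volume described by config: it optionally
// authenticates against the authnode, pulls the cluster/volume/partition views from the
// master (retrying up to MaxMountRetryLimit times unless the volume does not exist or the
// auth key mismatches), and starts the background refresh goroutines.
//
// A minimal usage sketch; the volume name, owner and master address below are placeholder
// values, not real cluster settings:
//
//	mw, err := meta.NewMetaWrapper(&meta.MetaConfig{
//		Volume:  "example-vol",
//		Owner:   "example-owner",
//		Masters: []string{"10.0.0.1:17010"},
//	})
//	if err != nil {
//		// handle error
//	}
//	defer mw.Close()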
func NewMetaWrapper(config *MetaConfig) (*MetaWrapper, error) {
var err error
mw := new(MetaWrapper)
mw.closeCh = make(chan struct{}, 1)
if config.Authenticate {
ticketMess := config.TicketMess
mw.ac = authSDK.NewAuthClient(ticketMess.TicketHosts, ticketMess.EnableHTTPS, ticketMess.CertFile)
ticket, err := mw.ac.API().GetTicket(config.Owner, ticketMess.ClientKey, proto.MasterServiceID)
if err != nil {
return nil, errors.Trace(err, "Get ticket from authnode failed!")
}
mw.authenticate = config.Authenticate
mw.accessToken.Ticket = ticket.Ticket
mw.accessToken.ClientID = config.Owner
mw.accessToken.ServiceID = proto.MasterServiceID
mw.sessionKey = ticket.SessionKey
mw.ticketMess = ticketMess
}
mw.volname = config.Volume
mw.owner = config.Owner
mw.ownerValidation = config.ValidateOwner
mw.mc = masterSDK.NewMasterClient(config.Masters, false)
mw.onAsyncTaskError = config.OnAsyncTaskError
mw.metaSendTimeout = config.MetaSendTimeout
mw.conns = util.NewConnectPool()
mw.partitions = make(map[uint64]*MetaPartition)
mw.ranges = btree.New(32)
mw.rwPartitions = make([]*MetaPartition, 0)
mw.partCond = sync.NewCond(&mw.partMutex)
mw.forceUpdate = make(chan struct{}, 1)
mw.forceUpdateLimit = rate.NewLimiter(1, MinForceUpdateMetaPartitionsInterval)
mw.EnableSummary = config.EnableSummary
mw.DirChildrenNumLimit = proto.DefaultDirChildrenNumLimit
mw.uniqidRangeMap = make(map[uint64]*uniqidRange, 0)
mw.qc = NewQuotaCache(DefaultQuotaExpiration, MaxQuotaCache)
mw.VerReadSeq = config.VerReadSeq
limit := 0
for limit < MaxMountRetryLimit {
// When initializing the volume, if the master explicitly responds that the specified
// volume does not exist, it will not retry.
if err = mw.initMetaWrapper(); err != nil {
log.LogErrorf("NewMetaWrapper: init meta wrapper failed: volume(%v) err(%v)", mw.volname, err)
if gerrors.Is(err, proto.ErrVolAuthKeyNotMatch) || gerrors.Is(err, proto.ErrVolNotExists) {
break
}
limit++
time.Sleep(MountRetryInterval * time.Duration(limit))
continue
}
break
}
if err != nil {
return nil, err
}
go mw.updateQuotaInfoTick()
go mw.refresh()
return mw, nil
}
func (mw *MetaWrapper) initMetaWrapper() (err error) {
if err = mw.updateClusterInfo(); err != nil {
return err
}
if err = mw.updateVolStatInfo(); err != nil {
return err
}
if err = mw.updateMetaPartitions(); err != nil {
return err
}
if err = mw.updateDirChildrenNumLimit(); err != nil {
return err
}
return nil
}
func (mw *MetaWrapper) Owner() string {
return mw.owner
}
func (mw *MetaWrapper) enableTx(mask proto.TxOpMask) bool {
return mw.EnableTransaction != proto.TxPause && mw.EnableTransaction&mask > 0
}
func (mw *MetaWrapper) OSSSecure() (accessKey, secretKey string) {
return mw.ossSecure.AccessKey, mw.ossSecure.SecretKey
}
func (mw *MetaWrapper) VolCreateTime() int64 {
return mw.volCreateTime
}
func (mw *MetaWrapper) Close() error {
mw.closeOnce.Do(func() {
close(mw.closeCh)
mw.conns.Close()
})
return nil
}
func (mw *MetaWrapper) Cluster() string {
return mw.cluster
}
func (mw *MetaWrapper) LocalIP() string {
return mw.localIP
}
func (mw *MetaWrapper) exporterKey(act string) string {
return fmt.Sprintf("%s_sdk_meta_%s", mw.cluster, act)
}
// Proto ResultCode to status
func parseStatus(result uint8) (status int) {
switch result {
case proto.OpOk:
status = statusOK
case proto.OpExistErr:
status = statusExist
case proto.OpNotExistErr:
status = statusNoent
case proto.OpInodeFullErr:
status = statusFull
case proto.OpAgain:
status = statusAgain
case proto.OpArgMismatchErr:
status = statusInval
case proto.OpNotPerm:
status = statusNotPerm
case proto.OpConflictExtentsErr:
status = StatusConflictExtents
case proto.OpDirQuota:
status = statusOpDirQuota
case proto.OpNotEmpty:
status = statusNotEmpty
case proto.OpNoSpaceErr:
status = statusNoSpace
case proto.OpTxInodeInfoNotExistErr:
status = statusTxInodeInfoNotExist
case proto.OpTxConflictErr:
status = statusTxConflict
case proto.OpTxTimeoutErr:
status = statusTxTimeout
case proto.OpUploadPartConflictErr:
status = statusUploadPartConflict
default:
status = statusError
}
return
}
func statusErrToErrno(status int, err error) error {
if status == statusOK && err != nil {
return syscall.EAGAIN
}
return statusToErrno(status)
}
func statusToErrno(status int) error {
switch status {
case statusOK:
// status may be OK while the caller's err is set; return a retryable error anyway
return syscall.EAGAIN
case statusExist:
return syscall.EEXIST
case statusNotEmpty:
return syscall.ENOTEMPTY
case statusNoent:
return syscall.ENOENT
case statusFull:
return syscall.ENOMEM
case statusAgain:
return syscall.EAGAIN
case statusInval:
return syscall.EINVAL
case statusNotPerm:
return syscall.EPERM
case statusError:
return syscall.EAGAIN
case StatusConflictExtents:
return syscall.ENOTSUP
case statusOpDirQuota:
return syscall.EDQUOT
case statusNoSpace:
return syscall.ENOSPC
case statusTxInodeInfoNotExist:
return syscall.EAGAIN
case statusTxConflict:
return syscall.EAGAIN
case statusTxTimeout:
return syscall.EAGAIN
case statusUploadPartConflict:
return syscall.EEXIST
default:
}
return syscall.EIO
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"fmt"
"strconv"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// API implementations
//
// txIcreate creates the inode and the transaction together
func (mw *MetaWrapper) txIcreate(tx *Transaction, mp *MetaPartition, mode, uid, gid uint32,
target []byte, quotaIds []uint32, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txIcreate", err, bgTime, 1)
}()
tx.SetTmID(mp.PartitionID)
req := &proto.TxCreateInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Mode: mode,
Uid: uid,
Gid: gid,
Target: target,
QuotaIds: quotaIds,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxCreateInodeResponse)
defer func() {
tx.OnExecuted(status, resp.TxInfo)
}()
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaTxCreateInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("txIcreate: err(%v)", err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("txIcreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
// set tx error msg
err = errors.New(packet.GetResultMsg())
log.LogErrorf("txIcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("txIcreate: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
if resp.Info == nil || resp.TxInfo == nil {
err = errors.New(fmt.Sprintf("txIcreate: info is nil, packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, string(packet.Data)))
log.LogWarn(err)
return
}
tx.Started = true
tx.txInfo = resp.TxInfo
log.LogDebugf("txIcreate: packet(%v) mp(%v) req(%v) info(%v) tx(%v)", packet, mp, *req, resp.Info, resp.TxInfo)
return status, resp.Info, nil
}
func (mw *MetaWrapper) quotaIcreate(mp *MetaPartition, mode, uid, gid uint32, target []byte, quotaIds []uint32, fullPath string) (status int,
info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("icreate", err, bgTime, 1)
}()
req := &proto.QuotaCreateInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Mode: mode,
Uid: uid,
Gid: gid,
Target: target,
QuotaIds: quotaIds,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpQuotaCreateInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("quotaIcreate: err(%v)", err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("quotaIcreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("quotaIcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.CreateInodeResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("quotaIcreate: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
if resp.Info == nil {
err = errors.New(fmt.Sprintf("quotaIcreate: info is nil, packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, string(packet.Data)))
log.LogWarn(err)
return
}
log.LogDebugf("quotaIcreate: packet(%v) mp(%v) req(%v) info(%v)", packet, mp, *req, resp.Info)
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) icreate(mp *MetaPartition, mode, uid, gid uint32, target []byte, fullPath string) (status int,
info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("icreate", err, bgTime, 1)
}()
req := &proto.CreateInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Mode: mode,
Uid: uid,
Gid: gid,
Target: target,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaCreateInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("icreate: err(%v)", err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("icreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("icreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.CreateInodeResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("icreate: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
if resp.Info == nil {
err = errors.New(fmt.Sprintf("icreate: info is nil, packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, string(packet.Data)))
log.LogWarn(err)
return
}
log.LogDebugf("icreate: packet(%v) mp(%v) req(%v) info(%v)", packet, mp, *req, resp.Info)
return statusOK, resp.Info, nil
}
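// sendToMetaPartitionWithTx wraps sendToMetaPartition and, while the meta node reports a
// transaction conflict, retries the request up to TxConflictRetryNum times, sleeping
// TxConflictRetryInterval milliseconds between attempts.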
func (mw *MetaWrapper) sendToMetaPartitionWithTx(mp *MetaPartition, req *proto.Packet) (packet *proto.Packet, err error) {
retryNum := int64(0)
for {
packet, err = mw.sendToMetaPartition(mp, req)
if err != nil {
log.LogErrorf("sendToMetaPartitionWithTx: packet(%v) mp(%v) reqType(%v) err(%v)",
string(req.Data), mp, req.GetOpMsg(), err)
return
}
if packet.ResultCode != proto.OpTxConflictErr {
break
}
log.LogWarnf("sendToMetaPartitionWithTx: packet(%v) mp(%v) reqType(%v) result(%v), tx conflict retry: %v req(%v)",
packet, mp, packet.GetOpMsg(), packet.GetResultMsg(), retryNum, string(req.Data))
retryNum++
if retryNum > mw.TxConflictRetryNum {
log.LogErrorf("sendToMetaPartitionWithTx: packet(%v) mp(%v) reqType(%v) result(%v), tx conflict retry: %v req(%v)",
packet, mp, packet.GetOpMsg(), packet.GetResultMsg(), retryNum, string(req.Data))
break
}
time.Sleep(time.Duration(mw.TxConflictRetryInterval) * time.Millisecond)
}
return
}
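// SendTxPack marshals a transaction request into a packet, sends it through
// sendToMetaPartitionWithTx, validates the result (via checkStatusFunc when provided,
// otherwise requiring statusOK), and unmarshals the response into resp when resp is non-nil.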
func (mw *MetaWrapper) SendTxPack(req proto.TxPack, resp interface{}, Opcode uint8, mp *MetaPartition,
checkStatusFunc func(int, *proto.Packet) error) (status int, err error, packet *proto.Packet) {
packet = proto.NewPacketReqID()
packet.Opcode = Opcode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("SendTxPack reqType(%v) txInfo(%v) : err(%v)", packet.GetOpMsg(), req.GetInfo(), err)
return
}
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("SendTxPack: packet(%v) mp(%v) txInfo(%v) err(%v)",
packet, mp, req.GetInfo(), err)
return
}
status = parseStatus(packet.ResultCode)
if checkStatusFunc != nil {
if err = checkStatusFunc(status, packet); err != nil {
log.LogErrorf("SendTxPack: packet(%v) mp(%v) req(%v) txInfo(%v) result(%v) err(%v)",
packet, mp, packet.GetOpMsg(), req.GetInfo(), packet.GetResultMsg(), err)
return
}
} else if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("SendTxPack: packet(%v) mp(%v) req(%v) txInfo(%v) result(%v)",
packet, mp, packet.GetOpMsg(), req.GetInfo(), packet.GetResultMsg())
return
}
if resp == nil {
return
}
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("SendTxPack: packet(%v) mp(%v) txInfo(%v) err(%v) PacketData(%v)",
packet, mp, req.GetInfo(), err, string(packet.Data))
return
}
return
}
func (mw *MetaWrapper) txIunlink(tx *Transaction, mp *MetaPartition, inode uint64, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txIunlink", err, bgTime, 1)
}()
req := &proto.TxUnlinkInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxUnlinkInodeResponse)
metric := exporter.NewTPCnt("OpMetaTxUnlinkInode")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxUnlinkInode, mp, nil); err != nil {
log.LogErrorf("txIunlink: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("txIunlink: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) iunlink(mp *MetaPartition, inode uint64, verSeq uint64, denVerSeq uint64, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("iunlink", err, bgTime, 1)
}()
// use a unique id to dedupe the request
status, uniqID, err := mw.consumeUniqID(mp)
if err != nil || status != statusOK {
err = statusToErrno(status)
return
}
req := &proto.UnlinkInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
UniqID: uniqID,
VerSeq: verSeq,
DenVerSeq: denVerSeq,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaUnlinkInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("iunlink: ino(%v) err(%v)", inode, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("iunlink: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("iunlink: packet(%v) mp(%v) req(%v) result(%v) status(%v)", packet, mp, *req, packet.GetResultMsg(), status)
return
}
resp := new(proto.UnlinkInodeResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("iunlink: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
log.LogDebugf("iunlink: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) iclearCache(mp *MetaPartition, inode uint64) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("iclearCache", err, bgTime, 1)
}()
req := &proto.ClearInodeCacheRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaClearInodeCache
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("iclearCache: ino(%v) err(%v)", inode, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("iclearCache: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("iclearCache: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("iclearCache: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return status, nil
}
func (mw *MetaWrapper) ievict(mp *MetaPartition, inode uint64, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ievict", err, bgTime, 1)
}()
req := &proto.EvictInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaEvictInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogWarnf("ievict: ino(%v) err(%v)", inode, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogWarnf("ievict: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogWarnf("ievict: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("ievict exit: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, nil
}
func (mw *MetaWrapper) txDcreate(tx *Transaction, mp *MetaPartition, parentID uint64, name string, inode uint64, mode uint32, quotaIds []uint32, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txDcreate", err, bgTime, 1)
}()
if parentID == inode {
return statusExist, nil
}
req := &proto.TxCreateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Inode: inode,
Name: name,
Mode: mode,
QuotaIds: quotaIds,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
metric := exporter.NewTPCnt("OpMetaTxCreateDentry")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
//statusCheckFunc := func(status int, packet *proto.Packet) (err error) {
// if (status != statusOK) && (status != statusExist) {
// err = errors.New(packet.GetResultMsg())
// log.LogErrorf("txDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
// return
// } else if status == statusExist {
// log.LogWarnf("txDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
// }
// return
//}
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, nil, proto.OpMetaTxCreateDentry, mp, nil); err != nil {
log.LogErrorf("txDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("txDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) quotaDcreate(mp *MetaPartition, parentID uint64, name string, inode uint64, mode uint32,
quotaIds []uint32, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("dcreate", err, bgTime, 1)
}()
if parentID == inode {
return statusExist, nil
}
req := &proto.QuotaCreateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Inode: inode,
Name: name,
Mode: mode,
QuotaIds: quotaIds,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpQuotaCreateDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("quotaDcreate: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("quotaDcreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if (status != statusOK) && (status != statusExist) {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("quotaDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
} else if status == statusExist {
log.LogWarnf("quotaDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
}
log.LogDebugf("quotaDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) dcreate(mp *MetaPartition, parentID uint64, name string, inode uint64, mode uint32, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("dcreate", err, bgTime, 1)
}()
if parentID == inode {
return statusExist, nil
}
req := &proto.CreateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Inode: inode,
Name: name,
Mode: mode,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaCreateDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("dcreate: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("dcreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if (status != statusOK) && (status != statusExist) {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("dcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
} else if status == statusExist {
log.LogWarnf("dcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
}
log.LogDebugf("dcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) txDupdate(tx *Transaction, mp *MetaPartition, parentID uint64, name string, newInode, oldIno uint64, fullPath string) (status int, oldInode uint64, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txDupdate", err, bgTime, 1)
}()
if parentID == newInode {
return statusExist, 0, nil
}
req := &proto.TxUpdateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
Inode: newInode,
OldIno: oldIno,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxUpdateDentryResponse)
metric := exporter.NewTPCnt("OpMetaTxUpdateDentry")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxUpdateDentry, mp, nil); err != nil {
log.LogErrorf("txDupdate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("txDupdate: packet(%v) mp(%v) req(%v) oldIno(%v)", packet, mp, *req, resp.Inode)
return statusOK, resp.Inode, nil
}
func (mw *MetaWrapper) dupdate(mp *MetaPartition, parentID uint64, name string, newInode uint64, fullPath string) (status int, oldInode uint64, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("dupdate", err, bgTime, 1)
}()
if parentID == newInode {
return statusExist, 0, nil
}
req := &proto.UpdateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
Inode: newInode,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaUpdateDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("dupdate: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("dupdate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("dupdate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.UpdateDentryResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("dupdate: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("dupdate: packet(%v) mp(%v) req(%v) oldIno(%v)", packet, mp, *req, resp.Inode)
return statusOK, resp.Inode, nil
}
func (mw *MetaWrapper) txCreateTX(tx *Transaction, mp *MetaPartition) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txCreateTX", err, bgTime, 1)
}()
tx.SetTmID(mp.PartitionID)
req := &proto.TxCreateRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
TransactionInfo: tx.txInfo,
}
resp := new(proto.TxCreateResponse)
metric := exporter.NewTPCnt("OpMetaTxCreate")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxCreate, mp, nil); err != nil {
log.LogErrorf("txCreateTX: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
if resp.TxInfo == nil {
err = fmt.Errorf("txCreateTX: create tx resp nil")
log.LogError(err)
return statusError, err
}
if log.EnableDebug() {
log.LogDebugf("txCreateTX: packet(%v) mp(%v) req(%v)", packet, mp, *req)
}
tx.txInfo = resp.TxInfo
tx.Started = true
return statusOK, nil
}
//func (mw *MetaWrapper) txPreCommit(tx *Transaction, mp *MetaPartition) (status int, err error) {
// bgTime := stat.BeginStat()
// defer func() {
// stat.EndStat("txPreCommit", err, bgTime, 1)
// }()
//
// tx.txInfo.TmID = int64(mp.PartitionID)
// req := &proto.TxPreCommitRequest{
// VolName: mw.volname,
// PartitionID: mp.PartitionID,
// TransactionInfo: tx.txInfo,
// }
//
// metric := exporter.NewTPCnt("OpTxPreCommit")
// defer func() {
// metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
// }()
//
// var packet *proto.Packet
// if status, err, packet = mw.SendTxPack(req, nil, proto.OpTxPreCommit, mp, nil); err != nil {
// log.LogErrorf("txPreCommit: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
// return
// }
//
// if log.EnableDebug() {
// log.LogDebugf("txPreCommit: packet(%v) mp(%v) req(%v)", packet, mp, *req)
// }
//
// return statusOK, nil
//}
func (mw *MetaWrapper) txDdelete(tx *Transaction, mp *MetaPartition, parentID, ino uint64, name string, fullPath string) (status int, inode uint64, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txDdelete", err, bgTime, 1)
}()
req := &proto.TxDeleteDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
Ino: ino,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxDeleteDentryResponse)
metric := exporter.NewTPCnt("OpMetaTxDeleteDentry")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxDeleteDentry, mp, nil); err != nil {
log.LogErrorf("txDdelete: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("txDdelete: packet(%v) mp(%v) req(%v) ino(%v)", packet, mp, *req, resp.Inode)
return statusOK, resp.Inode, nil
}
func (mw *MetaWrapper) ddelete(mp *MetaPartition, parentID uint64, name string, inodeCreateTime int64, verSeq uint64, fullPath string) (status int, inode uint64, denVer uint64, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ddelete", err, bgTime, 1)
}()
req := &proto.DeleteDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
InodeCreateTime: inodeCreateTime,
Verseq: verSeq,
}
req.FullPaths = []string{fullPath}
log.LogDebugf("action[ddelete] %v", req)
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaDeleteDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("ddelete: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("ddelete: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("ddelete: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.DeleteDentryResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("ddelete: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("ddelete: packet(%v) mp(%v) req(%v) ino(%v)", packet, mp, *req, resp.Inode)
return statusOK, resp.Inode, packet.VerSeq, nil
}
func (mw *MetaWrapper) canDeleteInode(mp *MetaPartition, info *proto.InodeInfo, ino uint64) (can bool, err error) {
createTime := info.CreateTime.Unix()
deleteLockTime := mw.volDeleteLockTime * 60 * 60
if deleteLockTime > 0 && createTime+deleteLockTime > time.Now().Unix() {
err = errors.NewErrorf("the current Inode[%v] is still locked for deletion", ino)
log.LogWarnf("canDeleteInode: mp(%v) ino(%v) err(%v)", mp, ino, err)
return false, syscall.EPERM
}
return true, nil
}
func (mw *MetaWrapper) ddeletes(mp *MetaPartition, parentID uint64, dentries []proto.Dentry, fullPaths []string) (status int,
resp *proto.BatchDeleteDentryResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ddeletes", err, bgTime, 1)
}()
req := &proto.BatchDeleteDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Dens: dentries,
FullPaths: fullPaths,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchDeleteDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("ddeletes: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("ddeletes: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status == statusAgain {
err = errors.New("conflict request")
log.LogErrorf("ddeletes: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("ddeletes: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp = new(proto.BatchDeleteDentryResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("ddeletes: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("ddeletes: packet(%v) mp(%v) req(%v) (%v)", packet, mp, *req, resp.Items)
return statusOK, resp, nil
}
func (mw *MetaWrapper) lookup(mp *MetaPartition, parentID uint64, name string, verSeq uint64) (status int, inode uint64, mode uint32, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("lookup", err, bgTime, 1)
}()
req := &proto.LookupRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
VerSeq: verSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaLookup
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("lookup: err(%v)", err)
return
}
log.LogDebugf("lookup enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("lookup: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
errMetric := exporter.NewCounter("fileOpenFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: mw.volname, exporter.Err: "EIO"})
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
if status != statusNoent {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("lookup: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
errMetric := exporter.NewCounter("fileOpenFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: mw.volname, exporter.Err: "EIO"})
} else {
log.LogDebugf("lookup exit: packet(%v) mp(%v) req(%v) NoEntry", packet, mp, *req)
}
return
}
resp := new(proto.LookupResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("lookup: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
errMetric := exporter.NewCounter("fileOpenFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: mw.volname, exporter.Err: "EIO"})
return
}
log.LogDebugf("lookup exit: packet(%v) mp(%v) req(%v) ino(%v) mode(%v)", packet, mp, *req, resp.Inode, resp.Mode)
return statusOK, resp.Inode, resp.Mode, nil
}
func (mw *MetaWrapper) iget(mp *MetaPartition, inode uint64, verSeq uint64) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("iget", err, bgTime, 1)
}()
req := &proto.InodeGetRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
VerSeq: verSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaInodeGet
packet.PartitionID = mp.PartitionID
log.LogDebugf("action[iget] pack mp id %v, req %v", mp.PartitionID, req)
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("iget: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("iget: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("iget: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.InodeGetResponse)
err = packet.UnmarshalData(resp)
if err != nil || resp.Info == nil {
log.LogErrorf("iget: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) batchIget(wg *sync.WaitGroup, mp *MetaPartition, inodes []uint64, respCh chan []*proto.InodeInfo) {
defer wg.Done()
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchIget", err, bgTime, 1)
}()
req := &proto.BatchInodeGetRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inodes: inodes,
VerSeq: mw.VerReadSeq,
}
log.LogDebugf("action[batchIget] req %v", req)
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchInodeGet
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchIget: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchIget: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.BatchInodeGetResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("batchIget: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("action[batchIget] resp %v", resp)
if len(resp.Infos) == 0 {
return
}
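// Non-blocking send: if the result channel cannot accept the infos immediately,
// they are dropped rather than blocking this worker.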
select {
case respCh <- resp.Infos:
default:
}
}
func (mw *MetaWrapper) readDir(mp *MetaPartition, parentID uint64) (status int, children []proto.Dentry, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("readDir", err, bgTime, 1)
}()
req := &proto.ReadDirRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaReadDir
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("readDir: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("readDir: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
children = make([]proto.Dentry, 0)
log.LogErrorf("readDir: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ReadDirResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("readDir: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("readDir: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, resp.Children, nil
}
// readDirLimit reads at most `limit` dentries of directory parentID from the given meta
// partition, starting from the dentry named `from` (an empty marker starts from the beginning).
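//
// A minimal paging sketch (hypothetical caller; variable names are assumptions):
// advance the marker to the last returned name, and note that the marker dentry itself
// is returned again at the head of every follow-up page.
//
//	from := ""
//	for {
//		status, children, err := mw.readDirLimit(mp, parentID, from, 1024, mw.VerReadSeq, 0)
//		if err != nil || status != statusOK || len(children) == 0 || (from != "" && len(children) == 1) {
//			break
//		}
//		// ... consume children (drop children[0] when from != "") ...
//		from = children[len(children)-1].Name
//	}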
func (mw *MetaWrapper) readDirLimit(mp *MetaPartition, parentID uint64, from string, limit uint64, verSeq uint64, verOpt uint8) (status int, children []proto.Dentry, err error) {
req := &proto.ReadDirLimitRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Marker: from,
Limit: limit,
VerSeq: verSeq,
VerOpt: verOpt,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaReadDirLimit
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("readDirLimit: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("action[readDirLimit] mp [%v] parentId %v", mp.PartitionID, parentID)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("readDirLimit: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
children = make([]proto.Dentry, 0)
log.LogErrorf("readDirLimit: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ReadDirLimitResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("readDirLimit: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("readDirLimit: packet(%v) mp(%v) req(%v) rsp(%v)", packet, mp, *req, resp.Children)
return statusOK, resp.Children, nil
}
func (mw *MetaWrapper) appendExtentKey(mp *MetaPartition, inode uint64, extent proto.ExtentKey, discard []proto.ExtentKey, isSplit bool) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("appendExtentKey", err, bgTime, 1)
}()
req := &proto.AppendExtentKeyWithCheckRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
Extent: extent,
DiscardExtents: discard,
IsSplit: isSplit,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaExtentAddWithCheck
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("appendExtentKey: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("appendExtentKey: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
if status != StatusConflictExtents {
log.LogErrorf("appendExtentKey: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
}
}
return status, err
}
func (mw *MetaWrapper) getExtents(mp *MetaPartition, inode uint64) (resp *proto.GetExtentsResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getExtents", err, bgTime, 1)
}()
req := &proto.GetExtentsRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaExtentsList
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("getExtents: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getExtents: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
resp = &proto.GetExtentsResponse{}
resp.Status = parseStatus(packet.ResultCode)
if resp.Status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getExtents: packet(%v) mp(%v) result(%v)", packet, mp, packet.GetResultMsg())
return
}
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getExtents: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
return resp, nil
}
func (mw *MetaWrapper) getObjExtents(mp *MetaPartition, inode uint64) (status int, gen, size uint64, extents []proto.ExtentKey, objExtents []proto.ObjExtentKey, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getObjExtents", err, bgTime, 1)
}()
req := &proto.GetExtentsRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaObjExtentsList
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("getObjExtents: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getObjExtents: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
extents = make([]proto.ExtentKey, 0)
log.LogErrorf("getObjExtents: packet(%v) mp(%v) result(%v)", packet, mp, packet.GetResultMsg())
return
}
resp := new(proto.GetObjExtentsResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getObjExtents: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
return statusOK, resp.Generation, resp.Size, resp.Extents, resp.ObjExtents, nil
}
// func (mw *MetaWrapper) delExtentKey(mp *MetaPartition, inode uint64, extents []proto.ExtentKey) (status int, err error) {
// req := &proto.DelExtentKeyRequest{
// VolName: mw.volname,
// PartitionID: mp.PartitionID,
// Inode: inode,
// Extents: extents,
// }
// packet := proto.NewPacketReqID()
// packet.Opcode = proto.OpMetaExtentsDel
// packet.PartitionID = mp.PartitionID
// err = packet.MarshalData(req)
// if err != nil {
// log.LogErrorf("delExtentKey: req(%v) err(%v)", *req, err)
// return
// }
// metric := exporter.NewTPCnt(packet.GetOpMsg())
// defer func() {
// metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
// }()
// packet, err = mw.sendToMetaPartition(mp, packet)
// if err != nil {
// log.LogErrorf("delExtentKey: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
// return
// }
// status = parseStatus(packet.ResultCode)
// if status != statusOK {
// log.LogErrorf("delExtentKey: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
// }
// return status, nil
// }
func (mw *MetaWrapper) truncate(mp *MetaPartition, inode, size uint64, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("truncate", err, bgTime, 1)
}()
req := &proto.TruncateRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
Size: size,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaTruncate
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("truncate: ino(%v) size(%v) err(%v)", inode, size, err)
return
}
log.LogDebugf("truncate enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("truncate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("truncate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("truncate exit: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, nil
}
func (mw *MetaWrapper) txIlink(tx *Transaction, mp *MetaPartition, inode uint64, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txIlink", err, bgTime, 1)
}()
req := &proto.TxLinkInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxLinkInodeResponse)
metric := exporter.NewTPCnt("OpMetaTxLinkInode")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxLinkInode, mp, nil); err != nil {
log.LogErrorf("txIlink: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
if log.EnableDebug() {
log.LogDebugf("txIlink exit: packet(%v) mp(%v) req(%v) info(%v)", packet, mp, *req, resp.Info)
}
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) ilink(mp *MetaPartition, inode uint64, fullPath string) (status int, info *proto.InodeInfo, err error) {
return mw.ilinkWork(mp, inode, proto.OpMetaLinkInode, fullPath)
}
func (mw *MetaWrapper) ilinkWork(mp *MetaPartition, inode uint64, op uint8, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ilink", err, bgTime, 1)
}()
// Use a per-partition unique id so the meta server can deduplicate retried requests.
status, uniqID, err := mw.consumeUniqID(mp)
if err != nil || status != statusOK {
err = statusToErrno(status)
return
}
req := &proto.LinkInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
UniqID: uniqID,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = op
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("ilink: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("ilink enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("ilink: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("ilink: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.LinkInodeResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("ilink: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
if resp.Info == nil {
err = errors.New(fmt.Sprintf("ilink: info is nil, packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, string(packet.Data)))
log.LogWarn(err)
return
}
log.LogDebugf("ilink exit: packet(%v) mp(%v) req(%v) info(%v)", packet, mp, *req, resp.Info)
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) setattr(mp *MetaPartition, inode uint64, valid, mode, uid, gid uint32, atime, mtime int64) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("setattr", err, bgTime, 1)
}()
req := &proto.SetAttrRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
Valid: valid,
Mode: mode,
Uid: uid,
Gid: gid,
AccessTime: atime,
ModifyTime: mtime,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaSetattr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("setattr: err(%v)", err)
return
}
log.LogDebugf("setattr enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("setattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("setattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("setattr exit: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, nil
}
func (mw *MetaWrapper) createMultipart(mp *MetaPartition, path string, extend map[string]string) (status int, multipartId string, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("createMultipart", err, bgTime, 1)
}()
req := &proto.CreateMultipartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Path: path,
Extend: extend,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpCreateMultipart
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("createMultipart: err(%v)", err)
return
}
log.LogDebugf("createMultipart enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("createMultipart: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("createMultipart: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.CreateMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("createMultipart: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp.Info.ID, nil
}
func (mw *MetaWrapper) getExpiredMultipart(prefix string, days int, mp *MetaPartition) (status int, infos []*proto.ExpiredMultipartInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getExpiredMultipart", err, bgTime, 1)
}()
req := &proto.GetExpiredMultipartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Prefix: prefix,
Days: days,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpGetExpiredMultipart
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("get session: err(%v)", err)
return
}
log.LogDebugf("getExpiredMultipart enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getExpiredMultipart: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getExpiredMultipart: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetExpiredMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getExpiredMultipart: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp.Infos, nil
}
func (mw *MetaWrapper) getMultipart(mp *MetaPartition, path, multipartId string) (status int, info *proto.MultipartInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getMultipart", err, bgTime, 1)
}()
req := &proto.GetMultipartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Path: path,
MultipartId: multipartId,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpGetMultipart
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("get session: err(%v)", err)
return
}
log.LogDebugf("getMultipart enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getMultipart: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getMultipart: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getMultipart: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) addMultipartPart(mp *MetaPartition, path, multipartId string, partId uint16, size uint64, md5 string, inodeInfo *proto.InodeInfo) (status int, oldNode uint64, updated bool, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("addMultipartPart", err, bgTime, 1)
}()
part := &proto.MultipartPartInfo{
ID: partId,
Inode: inodeInfo.Inode,
MD5: md5,
Size: size,
UploadTime: time.Now(),
}
req := &proto.AddMultipartPartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Path: path,
MultipartId: multipartId,
Part: part,
}
log.LogDebugf("addMultipartPart: part(%v), req(%v)", part, req)
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpAddMultipartPart
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("addMultipartPart: marshal packet fail, err(%v)", err)
return
}
log.LogDebugf("addMultipartPart entry: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("addMultipartPart: packet(%v) mp(%v) req(%v) part(%v) err(%v)", packet, mp, req, part, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("addMultipartPart: packet(%v) mp(%v) req(%v) part(%v) result(%v)", packet, mp, *req, part, packet.GetResultMsg())
return
}
resp := new(proto.AppendMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("appendMultipart: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return status, resp.OldInode, resp.Update, nil
}
func (mw *MetaWrapper) idelete(mp *MetaPartition, inode uint64, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("idelete", err, bgTime, 1)
}()
req := &proto.DeleteInodeRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaDeleteInode
packet.PartitionID = mp.PartitionID
if err = packet.MarshalData(req); err != nil {
log.LogErrorf("delete inode: err[%v]", err)
return
}
log.LogDebugf("delete inode: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("delete inode: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("idelete: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("idelete: packet(%v) mp(%v) req(%v) ino(%v)", packet, mp, *req, inode)
return statusOK, nil
}
func (mw *MetaWrapper) removeMultipart(mp *MetaPartition, path, multipartId string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("removeMultipart", err, bgTime, 1)
}()
req := &proto.RemoveMultipartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Path: path,
MultipartId: multipartId,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpRemoveMultipart
packet.PartitionID = mp.PartitionID
if err = packet.MarshalData(req); err != nil {
log.LogErrorf("delete session: err[%v]", err)
return
}
log.LogDebugf("delete session: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("delete session: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("delete session: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("delete session: packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, packet.Data)
return statusOK, nil
}
func (mw *MetaWrapper) appendExtentKeys(mp *MetaPartition, inode uint64, extents []proto.ExtentKey) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("appendExtentKeys", err, bgTime, 1)
}()
req := &proto.AppendExtentKeysRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Extents: extents,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchExtentsAdd
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batch append extent: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("appendExtentKeys: batch append extent: packet(%v) mp(%v) req(%v)", packet, mp, *req)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batch append extent: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batch append extent: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("batch append extent: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) appendObjExtentKeys(mp *MetaPartition, inode uint64, extents []proto.ObjExtentKey) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("appendObjExtentKeys", err, bgTime, 1)
}()
req := &proto.AppendObjExtentKeysRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
Extents: extents,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchObjExtentsAdd
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batch append obj extents: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("appendObjExtentKeys: batch append obj extents: packet(%v) mp(%v) req(%v)", packet, mp, *req)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batch append obj extents: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batch append obj extents: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("batch append obj extents: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) batchSetXAttr(mp *MetaPartition, inode uint64, attrs map[string]string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchSetXAttr", err, bgTime, 1)
}()
req := &proto.BatchSetXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Attrs: make(map[string]string),
}
for key, val := range attrs {
req.Attrs[key] = val
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchSetXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batchSetXAttr: matshal packet fail, err(%v)", err)
return
}
log.LogDebugf("batchSetXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchSetXAttr: send to partition fail, packet(%v) mp(%v) req(%v) err(%v)",
packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchSetXAttr: received fail status, packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("batchSetXAttr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) setXAttr(mp *MetaPartition, inode uint64, name []byte, value []byte) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("setXAttr", err, bgTime, 1)
}()
req := &proto.SetXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Key: string(name),
Value: string(value),
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaSetXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("setXAttr: matshal packet fail, err(%v)", err)
return
}
log.LogDebugf("setXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("setXAttr: send to partition fail, packet(%v) mp(%v) req(%v) err(%v)",
packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("setXAttr: received fail status, packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("setXAttr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) getAllXAttr(mp *MetaPartition, inode uint64) (attrs map[string]string, status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getAllXAttr", err, bgTime, 1)
}()
req := &proto.GetAllXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaGetAllXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("getAllXAttr: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("getAllXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getAllXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getAllXAttr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetAllXAttrResponse)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("get xattr: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
attrs = resp.Attrs
log.LogDebugf("getAllXAttr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) getXAttr(mp *MetaPartition, inode uint64, name string) (value string, status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getXAttr", err, bgTime, 1)
}()
req := &proto.GetXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Key: name,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaGetXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("get xattr: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("get xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("get xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("get xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetXAttrResponse)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("get xattr: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
value = resp.Value
log.LogDebugf("get xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) removeXAttr(mp *MetaPartition, inode uint64, name string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("removeXAttr", err, bgTime, 1)
}()
req := &proto.RemoveXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Key: name,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaRemoveXAttr
packet.PartitionID = mp.PartitionID
if err = packet.MarshalData(req); err != nil {
log.LogErrorf("remove xattr: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("remove xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
if packet, err = mw.sendToMetaPartition(mp, packet); err != nil {
log.LogErrorf("remove xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("remove xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("remove xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) listXAttr(mp *MetaPartition, inode uint64) (keys []string, status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("listXAttr", err, bgTime, 1)
}()
req := &proto.ListXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaListXAttr
packet.PartitionID = mp.PartitionID
if err = packet.MarshalData(req); err != nil {
log.LogErrorf("list xattr: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("list xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
if packet, err = mw.sendToMetaPartition(mp, packet); err != nil {
log.LogErrorf("list xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("list xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ListXAttrResponse)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("list xattr: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
keys = resp.XAttrs
log.LogDebugf("list xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) listMultiparts(mp *MetaPartition, prefix, delimiter, keyMarker string, multipartIdMarker string, maxUploads uint64) (status int, sessions *proto.ListMultipartResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("listMultiparts", err, bgTime, 1)
}()
req := &proto.ListMultipartRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Marker: keyMarker,
MultipartIdMarker: multipartIdMarker,
Max: maxUploads,
Delimiter: delimiter,
Prefix: prefix,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpListMultiparts
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("list sessions : err(%v)", err)
return
}
log.LogDebugf("listMultiparts enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("listMultiparts: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("listMultiparts: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ListMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("listMultiparts: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp, nil
}
func (mw *MetaWrapper) batchGetXAttr(mp *MetaPartition, inodes []uint64, keys []string) ([]*proto.XAttrInfo, error) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchGetXAttr", err, bgTime, 1)
}()
req := &proto.BatchGetXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inodes: inodes,
Keys: keys,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchGetXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
return nil, err
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchGetXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return nil, err
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchIget: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return nil, err
}
resp := new(proto.BatchGetXAttrResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("batchIget: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return nil, err
}
return resp.XAttrs, nil
}
func (mw *MetaWrapper) readdironly(mp *MetaPartition, parentID uint64) (status int, children []proto.Dentry, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("readdironly", err, bgTime, 1)
}()
req := &proto.ReadDirOnlyRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaReadDirOnly
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("readDir: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("readDir: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
children = make([]proto.Dentry, 0)
log.LogErrorf("readDir: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ReadDirOnlyResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("readDir: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("readDir: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, resp.Children, nil
}
func (mw *MetaWrapper) updateXAttrs(mp *MetaPartition, inode uint64, filesInc int64, dirsInc int64, bytesInc int64) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("updateXAttrs", err, bgTime, 1)
}()
value := strconv.FormatInt(filesInc, 10) + "," + strconv.FormatInt(dirsInc, 10) + "," + strconv.FormatInt(bytesInc, 10)
req := &proto.UpdateXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Key: SummaryKey,
Value: value,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaUpdateXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("updateXAttr: matshal packet fail, err(%v)", err)
return err
}
log.LogDebugf("updateXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("readdironly: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return err
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("readdironly: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return err
}
log.LogDebugf("updateXAttrs: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return nil
}
func (mw *MetaWrapper) batchSetInodeQuota(mp *MetaPartition, inodes []uint64, quotaId uint32,
isRoot bool) (resp *proto.BatchSetMetaserverQuotaResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchSetInodeQuota", err, bgTime, 1)
}()
req := &proto.BatchSetMetaserverQuotaReuqest{
PartitionId: mp.PartitionID,
Inodes: inodes,
QuotaId: quotaId,
IsRoot: isRoot,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchSetInodeQuota
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batchSetInodeQuota MarshalData req [%v] fail.", req)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchSetInodeQuota: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchSetInodeQuota: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp = new(proto.BatchSetMetaserverQuotaResponse)
resp.InodeRes = make(map[uint64]uint8, 0)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("batchSetInodeQuota: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
log.LogInfof("batchSetInodeQuota inodes [%v] quota [%v] resp [%v] success.", inodes, quotaId, resp)
return
}
func (mw *MetaWrapper) batchDeleteInodeQuota(mp *MetaPartition, inodes []uint64,
quotaId uint32) (resp *proto.BatchDeleteMetaserverQuotaResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchDeleteInodeQuota", err, bgTime, 1)
}()
req := &proto.BatchDeleteMetaserverQuotaReuqest{
PartitionId: mp.PartitionID,
Inodes: inodes,
QuotaId: quotaId,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchDeleteInodeQuota
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota MarshalData req [%v] fail.", req)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchDeleteInodeQuota: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp = new(proto.BatchDeleteMetaserverQuotaResponse)
resp.InodeRes = make(map[uint64]uint8, 0)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("batchSetInodeQuota: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
log.LogInfof("batchDeleteInodeQuota inodes [%v] quota [%v] resp [%v] success.",
inodes, quotaId, resp)
return
}
func (mw *MetaWrapper) getInodeQuota(mp *MetaPartition, inode uint64) (quotaInfos map[uint32]*proto.MetaQuotaInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getInodeQuota", err, bgTime, 1)
}()
req := &proto.GetInodeQuotaRequest{
PartitionId: mp.PartitionID,
Inode: inode,
}
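// Serve from the local quota cache first; a hit avoids a round trip to the meta partition.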
qcInfo := mw.qc.Get(inode)
if qcInfo != nil {
return qcInfo.quotaInfos, nil
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaGetInodeQuota
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("getInodeQuota: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("getInodeQuota: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getInodeQuota: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getInodeQuota: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetInodeQuotaResponse)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("getInodeQuota: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
quotaInfos = resp.MetaQuotaInfoMap
var qinfo QuotaCacheInfo
qinfo.quotaInfos = quotaInfos
qinfo.inode = inode
mw.qc.Put(inode, &qinfo)
log.LogDebugf("getInodeQuota: req(%v) resp(%v) err(%v)", *req, *resp, err)
return
}
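// applyQuota walks the directory tree rooted at parentIno and applies quotaId to every
// inode under it (including parentIno itself when first is true). Discovered inodes are
// accumulated in *inodes and flushed to the meta layer in batches of maxInodes via
// BatchSetInodeQuota_ll; *totalInodeCount and *curInodeCount track overall and in-batch
// progress.
//
// A minimal invocation sketch (hypothetical caller; values are assumptions):
//
//	var total, cur uint64
//	inodes := make([]uint64, 0, 128)
//	err := mw.applyQuota(rootIno, quotaId, &total, &cur, &inodes, 128, true)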
func (mw *MetaWrapper) applyQuota(parentIno uint64, quotaId uint32, totalInodeCount *uint64, curInodeCount *uint64, inodes *[]uint64,
maxInodes uint64, first bool) (err error) {
if first {
var rootInodes []uint64
var ret map[uint64]uint8
rootInodes = append(rootInodes, parentIno)
ret, err = mw.BatchSetInodeQuota_ll(rootInodes, quotaId, true)
if err != nil {
return
}
if status, ok := ret[parentIno]; ok {
if status != proto.OpOk {
if status == proto.OpNotExistErr {
err = fmt.Errorf("apply inode %v is not exist.", parentIno)
} else {
err = fmt.Errorf("apply inode %v failed, status: %v.", parentIno, status)
}
return
}
}
*totalInodeCount = *totalInodeCount + 1
}
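// Page through the directory with ReadDirLimit_ll, batching discovered inodes and
// recursing into sub-directories; the marker entry is skipped on follow-up pages.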
var defaultReaddirLimit uint64 = 1024
noMore := false
from := ""
for !noMore {
entries, err := mw.ReadDirLimit_ll(parentIno, from, defaultReaddirLimit)
if err != nil {
return err
}
entryNum := uint64(len(entries))
if entryNum == 0 || (from != "" && entryNum == 1) {
break
}
if entryNum < defaultReaddirLimit {
noMore = true
}
if from != "" {
entries = entries[1:]
}
for _, entry := range entries {
*inodes = append(*inodes, entry.Inode)
*curInodeCount = *curInodeCount + 1
*totalInodeCount = *totalInodeCount + 1
if *curInodeCount >= maxInodes {
mw.BatchSetInodeQuota_ll(*inodes, quotaId, false)
*curInodeCount = 0
*inodes = (*inodes)[:0]
}
if proto.IsDir(entry.Type) {
err = mw.applyQuota(entry.Inode, quotaId, totalInodeCount, curInodeCount, inodes, maxInodes, false)
if err != nil {
return err
}
}
}
from = entries[len(entries)-1].Name
}
if first && *curInodeCount > 0 {
mw.BatchSetInodeQuota_ll(*inodes, quotaId, false)
*curInodeCount = 0
*inodes = (*inodes)[:0]
}
return
}
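// revokeQuota mirrors applyQuota: it walks the tree rooted at parentIno and removes
// quotaId from every inode under it, flushing batches of maxInodes via BatchDeleteInodeQuota_ll.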
func (mw *MetaWrapper) revokeQuota(parentIno uint64, quotaId uint32, totalInodeCount *uint64, curInodeCount *uint64, inodes *[]uint64,
maxInodes uint64, first bool) (err error) {
if first {
var rootInodes []uint64
rootInodes = append(rootInodes, parentIno)
_, err = mw.BatchDeleteInodeQuota_ll(rootInodes, quotaId)
if err != nil {
return
}
*totalInodeCount = *totalInodeCount + 1
}
var defaultReaddirLimit uint64 = 1024
noMore := false
from := ""
for !noMore {
entries, err := mw.ReadDirLimit_ll(parentIno, from, defaultReaddirLimit)
if err != nil {
return err
}
entryNum := uint64(len(entries))
if entryNum == 0 || (from != "" && entryNum == 1) {
break
}
if entryNum < defaultReaddirLimit {
noMore = true
}
if from != "" {
entries = entries[1:]
}
for _, entry := range entries {
*inodes = append(*inodes, entry.Inode)
*curInodeCount = *curInodeCount + 1
*totalInodeCount = *totalInodeCount + 1
if *curInodeCount >= maxInodes {
mw.BatchDeleteInodeQuota_ll(*inodes, quotaId)
*curInodeCount = 0
*inodes = (*inodes)[:0]
}
if proto.IsDir(entry.Type) {
err = mw.revokeQuota(entry.Inode, quotaId, totalInodeCount, curInodeCount, inodes, maxInodes, false)
if err != nil {
return err
}
}
}
from = entries[len(entries)-1].Name
}
if first && *curInodeCount > 0 {
mw.BatchDeleteInodeQuota_ll(*inodes, quotaId)
*curInodeCount = 0
*inodes = (*inodes)[:0]
}
return
}
func (mw *MetaWrapper) consumeUniqID(mp *MetaPartition) (status int, uniqid uint64, err error) {
pid := mp.PartitionID
mw.uniqidRangeMutex.Lock()
defer mw.uniqidRangeMutex.Unlock()
id, ok := mw.uniqidRangeMap[pid]
if ok {
if id.cur < id.end {
status = statusOK
uniqid = id.cur
id.cur = id.cur + 1
return
}
}
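// Local range exhausted (or not yet allocated): fetch a fresh range of maxUniqID ids
// from the meta partition and hand out its first id.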
status, start, err := mw.getUniqID(mp, maxUniqID)
if err != nil || status != statusOK {
return status, 0, err
}
uniqid = start
if ok {
id.cur = start + 1
id.end = start + maxUniqID
} else {
mw.uniqidRangeMap[pid] = &uniqidRange{start + 1, start + maxUniqID}
}
return
}
func (mw *MetaWrapper) getUniqID(mp *MetaPartition, num uint32) (status int, start uint64, err error) {
req := &proto.GetUniqIDRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Num: num,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaGetUniqID
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
return
}
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getUniqID: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
log.LogErrorf("getUniqID: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetUniqIDResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getUniqID: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
start = resp.Start
return
}
func (mw *MetaWrapper) checkVerFromMeta(packet *proto.Packet) {
if packet.VerSeq <= mw.Client.GetLatestVer() {
return
}
log.LogDebugf("checkVerFromMeta.UpdateLatestVer.try update meta wrapper verSeq from %v to %v verlist[%v]", mw.Client.GetLatestVer(), packet.VerSeq, packet.VerList)
mw.Client.UpdateLatestVer(&proto.VolVersionInfoList{VerList: packet.VerList})
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"fmt"
"github.com/cubefs/cubefs/util/btree"
)
type MetaPartition struct {
PartitionID uint64
Start uint64
End uint64
Members []string
LeaderAddr string
Status int8
}
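// Less orders partitions by their Start inode so that the wrapper's btree of ranges
// can be searched by inode number.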
func (mp *MetaPartition) Less(than btree.Item) bool {
that := than.(*MetaPartition)
return mp.Start < that.Start
}
func (mp *MetaPartition) Copy() btree.Item {
return mp
}
func (mp *MetaPartition) String() string {
return fmt.Sprintf("PartitionID(%v) Start(%v) End(%v) Members(%v) LeaderAddr(%v) Status(%v)", mp.PartitionID, mp.Start, mp.End, mp.Members, mp.LeaderAddr, mp.Status)
}
// Meta partition management
//
func (mw *MetaWrapper) addPartition(mp *MetaPartition) {
mw.partitions[mp.PartitionID] = mp
mw.ranges.ReplaceOrInsert(mp)
}
func (mw *MetaWrapper) deletePartition(mp *MetaPartition) {
delete(mw.partitions, mp.PartitionID)
mw.ranges.Delete(mp)
}
func (mw *MetaWrapper) replaceOrInsertPartition(mp *MetaPartition) {
mw.Lock()
defer mw.Unlock()
found, ok := mw.partitions[mp.PartitionID]
if ok {
mw.deletePartition(found)
}
mw.addPartition(mp)
return
}
func (mw *MetaWrapper) getPartitionByID(id uint64) *MetaPartition {
mw.RLock()
defer mw.RUnlock()
mp, ok := mw.partitions[id]
if !ok {
return nil
}
return mp
}
func (mw *MetaWrapper) getPartitionByInode(ino uint64) *MetaPartition {
var mp *MetaPartition
mw.RLock()
defer mw.RUnlock()
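// Descend from the partition with the largest Start <= ino and verify that ino
// actually falls within its [Start, End] range.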
pivot := &MetaPartition{Start: ino}
mw.ranges.DescendLessOrEqual(pivot, func(i btree.Item) bool {
mp = i.(*MetaPartition)
if ino > mp.End || ino < mp.Start {
mp = nil
}
// Visiting a single item is enough
return false
})
return mp
}
//func (mw *MetaWrapper) getRWPartitions() []*MetaPartition {
// rwPartitions := make([]*MetaPartition, 0)
// mw.RLock()
// defer mw.RUnlock()
// for _, mp := range mw.partitions {
// if mp.Status == proto.ReadWrite {
// rwPartitions = append(rwPartitions, mp)
// }
// }
// return rwPartitions
//}
func (mw *MetaWrapper) getRWPartitions() []*MetaPartition {
mw.RLock()
defer mw.RUnlock()
rwPartitions := mw.rwPartitions
if len(rwPartitions) == 0 {
rwPartitions = make([]*MetaPartition, 0)
for _, mp := range mw.partitions {
rwPartitions = append(rwPartitions, mp)
}
}
return rwPartitions
}
// getNextPartition returns the partition whose Start is larger than ino.
// Returns nil if there is no successive partition.
func (mw *MetaWrapper) getNextPartition(ino uint64) *MetaPartition {
var mp *MetaPartition
mw.RLock()
defer mw.RUnlock()
pivot := &MetaPartition{Start: ino + 1}
mw.ranges.AscendGreaterOrEqual(pivot, func(i btree.Item) bool {
mp = i.(*MetaPartition)
return false
})
return mp
}
func (mw *MetaWrapper) getLatestPartition() *MetaPartition {
mw.RLock()
defer mw.RUnlock()
return mw.ranges.Max().(*MetaPartition)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"container/list"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
MinQuotaCacheEvictNum = 10
)
type QuotaCache struct {
sync.RWMutex
cache map[uint64]*list.Element
lruList *list.List
expiration time.Duration
maxElements int
}
type QuotaCacheInfo struct {
quotaInfos map[uint32]*proto.MetaQuotaInfo
expiration int64
inode uint64
}
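// NewQuotaCache builds an LRU cache of per-inode quota info. Entries expire after exp,
// the cache holds at most maxElements entries, and a background goroutine evicts expired
// entries on every exp tick.
//
// A minimal usage sketch (hypothetical caller; values are assumptions):
//
//	qc := NewQuotaCache(5*time.Minute, 10000)
//	qc.Put(ino, &QuotaCacheInfo{quotaInfos: infos, inode: ino})
//	if cached := qc.Get(ino); cached != nil {
//		// use cached.quotaInfos
//	}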
func NewQuotaCache(exp time.Duration, maxElements int) *QuotaCache {
qc := &QuotaCache{
cache: make(map[uint64]*list.Element),
lruList: list.New(),
expiration: exp,
maxElements: maxElements,
}
go qc.backgroundEviction()
return qc
}
func (qc *QuotaCache) Put(ino uint64, qinfo *QuotaCacheInfo) {
qc.Lock()
defer qc.Unlock()
old, ok := qc.cache[ino]
if ok {
qc.lruList.Remove(old)
delete(qc.cache, ino)
}
if qc.lruList.Len() >= qc.maxElements {
qc.evict(true)
}
qinfo.quotaSetExpiration(qc.expiration)
element := qc.lruList.PushFront(qinfo)
qc.cache[ino] = element
}
func (qc *QuotaCache) Get(ino uint64) *QuotaCacheInfo {
qc.RLock()
defer qc.RUnlock()
element, ok := qc.cache[ino]
if !ok {
return nil
}
info := element.Value.(*QuotaCacheInfo)
if info.quotaExpired() {
return nil
}
return info
}
func (qc *QuotaCache) Delete(ino uint64) {
qc.Lock()
defer qc.Unlock()
element, ok := qc.cache[ino]
if ok {
qc.lruList.Remove(element)
delete(qc.cache, ino)
}
}
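// evict removes entries from the LRU tail. In foreground mode (cache full on Put) it
// unconditionally drops up to MinQuotaCacheEvictNum entries; in background mode it only
// drops expired entries, stopping at the first unexpired one.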
func (qc *QuotaCache) evict(foreground bool) {
for i := 0; i < MinQuotaCacheEvictNum; i++ {
element := qc.lruList.Back()
if element == nil {
return
}
info := element.Value.(*QuotaCacheInfo)
if !foreground && !info.quotaExpired() {
return
}
qc.lruList.Remove(element)
delete(qc.cache, info.inode)
}
// For background eviction, continue evicting all remaining expired items from the cache
if foreground {
return
}
for i := 0; i < qc.maxElements; i++ {
element := qc.lruList.Back()
if element == nil {
break
}
info := element.Value.(*QuotaCacheInfo)
if !info.quotaExpired() {
break
}
qc.lruList.Remove(element)
delete(qc.cache, info.inode)
}
}
func (qc *QuotaCache) backgroundEviction() {
t := time.NewTicker(qc.expiration)
defer t.Stop()
for range t.C {
log.LogInfof("QuotaCache: start BG evict")
qc.Lock()
qc.evict(false)
qc.Unlock()
log.LogInfof("QuotaCache: end BG evict")
}
}
func (qinfo *QuotaCacheInfo) quotaSetExpiration(expiration time.Duration) {
qinfo.expiration = time.Now().Add(expiration).UnixNano()
}
func (qinfo *QuotaCacheInfo) quotaExpired() bool {
return time.Now().UnixNano() > qinfo.expiration
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"errors"
"fmt"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
type Transaction struct {
txInfo *proto.TransactionInfo
Started bool
status int
onCommitFuncs []func()
onRollbackFuncs []func()
sync.RWMutex
}
func (tx *Transaction) SetTxID(clientId uint64) {
tx.txInfo.TxID = genTransactionId(clientId)
}
func (tx *Transaction) GetTxID() string {
tx.RLock()
defer tx.RUnlock()
return tx.txInfo.TxID
}
func (tx *Transaction) SetTmID(tmID uint64) {
tx.txInfo.TmID = int64(tmID)
}
func (tx *Transaction) AddInode(inode *proto.TxInodeInfo) error {
tx.Lock()
defer tx.Unlock()
if tx.Started {
return errors.New("transaction already started")
} else {
tx.txInfo.TxInodeInfos[inode.GetKey()] = inode
}
return nil
}
func (tx *Transaction) AddDentry(dentry *proto.TxDentryInfo) error {
tx.Lock()
defer tx.Unlock()
if tx.Started {
return errors.New("transaction already started")
} else {
tx.txInfo.TxDentryInfos[dentry.GetKey()] = dentry
}
return nil
}
// NewTransaction returns a `Transaction` with a timeout (in seconds) after which the transaction
// will be rolled back if it has not yet completed.
func NewTransaction(timeout int64, txType uint32) (tx *Transaction) {
if timeout == 0 {
timeout = proto.DefaultTransactionTimeout
}
return &Transaction{
onCommitFuncs: make([]func(), 0),
onRollbackFuncs: make([]func(), 0),
txInfo: proto.NewTransactionInfo(timeout, txType),
}
}
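// Usage sketch (illustrative; the host addresses, parentIno, parentMpID and inoMpID
// values are placeholders): a transaction is assembled from the dentry/inode items it
// touches before any request is sent, and callbacks can be attached for both outcomes.
//
//	tx := NewTransaction(0, proto.TxTypeCreate) // 0 falls back to proto.DefaultTransactionTimeout
//	den := proto.NewTxDentryInfo("h1:17210,h2:17210", parentIno, "f1", parentMpID)
//	ino := proto.NewTxInodeInfo("h3:17210", 0, inoMpID)
//	err := tx.AddDentry(den)
//	if err == nil {
//		err = tx.AddInode(ino)
//	}
//	tx.SetOnCommit(func() { /* e.g. invalidate a local dentry cache */ })
//	tx.SetOnRollback(func() { /* undo client-side bookkeeping */ })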
func (tx *Transaction) OnExecuted(status int, respTxInfo *proto.TransactionInfo) {
tx.Lock()
defer tx.Unlock()
tx.status = status
if tx.status == statusOK {
if !tx.Started {
tx.Started = true
}
if tx.txInfo.TxID == "" && respTxInfo != nil {
tx.txInfo = respTxInfo
}
}
}
func (tx *Transaction) SetOnCommit(job func()) {
tx.onCommitFuncs = append(tx.onCommitFuncs, job)
}
func (tx *Transaction) SetOnRollback(job func()) {
tx.onRollbackFuncs = append(tx.onRollbackFuncs, job)
// tx.onRollback = job
}
func (tx *Transaction) OnDone(err error, mw *MetaWrapper) (newErr error) {
// commit or rollback depending on status
newErr = err
if !tx.Started {
return
}
if err != nil {
log.LogDebugf("OnDone: rollback, tx %s", tx.txInfo.TxID)
tx.Rollback(mw)
} else {
log.LogDebugf("OnDone: commit, tx %s", tx.txInfo.TxID)
newErr = tx.Commit(mw)
}
return
}
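// Lifecycle sketch (illustrative; `mw` is an initialized *MetaWrapper and `doTxOp`
// stands for whatever meta operation is executed under the transaction):
//
//	tx, err := NewCreateTransaction(parentMp, inoMp, parentIno, "f1", 0, proto.TxTypeCreate)
//	if err == nil {
//		err = doTxOp(tx)         // RM requests; tx.OnExecuted(...) marks the tx as started
//		err = tx.OnDone(err, mw) // commit on success, rollback on failure
//	}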
// Commit notifies all the RMs (related meta partitions) that the transaction has completed successfully,
// so the corresponding transaction items can be removed.
func (tx *Transaction) Commit(mw *MetaWrapper) (err error) {
tmMP := mw.getPartitionByID(uint64(tx.txInfo.TmID))
if tmMP == nil {
log.LogErrorf("Transaction commit: No TM partition, TmID(%v), txID(%v)", tx.txInfo.TmID, tx.txInfo.TxID)
return fmt.Errorf("transaction commit: can't find target mp for tx, mpId %d", tx.txInfo.TmID)
}
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txCommit", err, bgTime, 1)
}()
metric := exporter.NewTPCnt("OpTxCommit")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
req := &proto.TxApplyRequest{
TxID: tx.txInfo.TxID,
TmID: uint64(tx.txInfo.TmID),
TxApplyType: proto.TxCommit,
// TxInfo: tx.txInfo,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpTxCommit
packet.PartitionID = tmMP.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("Transaction commit: TmID(%v), txID(%v), req(%v) err(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, *req, err)
return
}
packet, err = mw.sendToMetaPartition(tmMP, packet)
if err != nil {
log.LogErrorf("Transaction commit: txID(%v), packet(%v) mp(%v) req(%v) err(%v)",
tx.txInfo.TxID, packet, tmMP, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("Transaction commit failed: TmID(%v), txID(%v), packet(%v) mp(%v) req(%v) result(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, packet, tmMP, *req, packet.GetResultMsg())
return
}
for _, job := range tx.onCommitFuncs {
job()
}
if log.EnableDebug() {
log.LogDebugf("Transaction commit succesfully: TmID(%v), txID(%v), packet(%v) mp(%v) req(%v) result(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, packet, tmMP, *req, packet.GetResultMsg())
}
return
}
// Rollback notifies all the RMs (related meta partitions) that the transaction has been cancelled,
// so the corresponding transaction items should be rolled back to their previous (pre-transaction) state.
func (tx *Transaction) Rollback(mw *MetaWrapper) {
tmMP := mw.getPartitionByID(uint64(tx.txInfo.TmID))
if tmMP == nil {
log.LogWarnf("Transaction Rollback: No TM partition, TmID(%v), txID(%v)", tx.txInfo.TmID, tx.txInfo.TxID)
return
}
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txRollback", err, bgTime, 1)
}()
req := &proto.TxApplyRequest{
TxID: tx.txInfo.TxID,
TmID: uint64(tx.txInfo.TmID),
TxApplyType: proto.TxRollback,
// TxInfo: tx.txInfo,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpTxRollback
packet.PartitionID = tmMP.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("Transaction Rollback: TmID(%v), txID(%v), req(%v) err(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, *req, err)
return
}
metric := exporter.NewTPCnt("OpTxRollback")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(tmMP, packet)
if err != nil {
log.LogErrorf("Transaction Rollback: txID(%v), packet(%v) mp(%v) req(%v) err(%v)",
tx.txInfo.TxID, packet, tmMP, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
log.LogErrorf("Transaction Rollback failed: TmID(%v), txID(%v), packet(%v) mp(%v) req(%v) result(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, packet, tmMP, *req, packet.GetResultMsg())
return
}
for _, job := range tx.onRollbackFuncs {
job()
}
if log.EnableDebug() {
log.LogDebugf("Transaction Rollback successfully: TmID(%v), txID(%v), packet(%v) mp(%v) req(%v) result(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, packet, tmMP, *req, packet.GetResultMsg())
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"fmt"
"sync/atomic"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
var txId uint64 = 1
func genTransactionId(clientId uint64) string {
return fmt.Sprintf("%d_%d", clientId, atomic.AddUint64(&txId, 1))
}
func getMembersFromMp(parentMp *MetaPartition) string {
members := parentMp.LeaderAddr
for _, addr := range parentMp.Members {
if addr == parentMp.LeaderAddr {
continue
}
if members == "" {
members += addr
} else {
members += "," + addr
}
}
return members
}
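// Example (illustrative; the addresses are placeholders): the leader address always
// comes first, followed by the remaining members, comma separated.
//
//	mp := &MetaPartition{LeaderAddr: "h1:17210", Members: []string{"h1:17210", "h2:17210"}}
//	getMembersFromMp(mp) // "h1:17210,h2:17210"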
func NewCreateTransaction(parentMp, inoMp *MetaPartition, parentID uint64, name string, txTimeout int64, txType uint32) (tx *Transaction, err error) {
// tx = NewTransaction(txTimeout, proto.TxTypeCreate)
tx = NewTransaction(txTimeout, txType)
members := getMembersFromMp(parentMp)
if members == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
inoMembers := getMembersFromMp(inoMp)
txDentryInfo := proto.NewTxDentryInfo(members, parentID, name, parentMp.PartitionID)
txParInoInfo := proto.NewTxInodeInfo(inoMembers, 0, inoMp.PartitionID)
if err = tx.AddDentry(txDentryInfo); err != nil {
return nil, err
}
if err = tx.AddInode(txParInoInfo); err != nil {
return nil, err
}
if log.EnableDebug() {
log.LogDebugf("NewCreateTransaction: txInfo(%v) parentMp", tx.txInfo)
}
return tx, nil
}
func NewDeleteTransaction(
denMp *MetaPartition, parentID uint64, name string,
inoMp *MetaPartition, ino uint64, txTimeout int64) (tx *Transaction, err error) {
tx = NewTransaction(txTimeout, proto.TxTypeRemove)
denMembers := getMembersFromMp(denMp)
if denMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
inoMembers := getMembersFromMp(inoMp)
if inoMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
txInoInfo := proto.NewTxInodeInfo(inoMembers, ino, inoMp.PartitionID)
txDentryInfo := proto.NewTxDentryInfo(denMembers, parentID, name, denMp.PartitionID)
if err = tx.AddInode(txInoInfo); err != nil {
return nil, err
}
if err = tx.AddDentry(txDentryInfo); err != nil {
return nil, err
}
if log.EnableDebug() {
log.LogDebugf("NewDeleteTransaction: tx(%v)", tx)
}
return tx, nil
}
func NewRenameTransaction(srcMp *MetaPartition, srcDenParentID uint64, srcName string,
dstMp *MetaPartition, dstDenParentID uint64, dstName string, txTimeout int64) (tx *Transaction, err error) {
tx = NewTransaction(txTimeout, proto.TxTypeRename)
srcMembers := getMembersFromMp(srcMp)
if srcMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
dstMembers := getMembersFromMp(dstMp)
if dstMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
txSrcDentryInfo := proto.NewTxDentryInfo(srcMembers, srcDenParentID, srcName, srcMp.PartitionID)
txDstDentryInfo := proto.NewTxDentryInfo(dstMembers, dstDenParentID, dstName, dstMp.PartitionID)
if err = tx.AddDentry(txSrcDentryInfo); err != nil {
return nil, err
}
if err = tx.AddDentry(txDstDentryInfo); err != nil {
return nil, err
}
if log.EnableDebug() {
log.LogDebugf("NewRenameTransaction: txInfo(%v)", tx.txInfo)
}
return tx, nil
}
func RenameTxReplaceInode(tx *Transaction, inoMp *MetaPartition, ino uint64) (err error) {
inoMembers := getMembersFromMp(inoMp)
if inoMembers == "" {
return fmt.Errorf("invalid parent metapartition")
}
txInoInfo := proto.NewTxInodeInfo(inoMembers, ino, inoMp.PartitionID)
_ = tx.AddInode(txInoInfo)
log.LogDebugf("RenameTxReplaceInode: txInfo(%v)", tx.txInfo)
return nil
}
func NewLinkTransaction(
denMp *MetaPartition, parentID uint64, name string,
inoMp *MetaPartition, ino uint64, txTimeout int64) (tx *Transaction, err error) {
tx = NewTransaction(txTimeout, proto.TxTypeLink)
denMembers := getMembersFromMp(denMp)
if denMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
inoMembers := getMembersFromMp(inoMp)
if inoMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
txInoInfo := proto.NewTxInodeInfo(inoMembers, ino, inoMp.PartitionID)
txDentryInfo := proto.NewTxDentryInfo(denMembers, parentID, name, denMp.PartitionID)
if err = tx.AddInode(txInoInfo); err != nil {
return nil, err
}
if err = tx.AddDentry(txDentryInfo); err != nil {
return nil, err
}
if log.EnableDebug() {
log.LogDebugf("NewLinkTransaction: tx(%v)", tx)
}
return tx, nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"crypto/md5"
"encoding/base64"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"strings"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/jacobsa/daemonize"
)
const (
MaxSendToMaster = 3
)
type VolumeView struct {
Name string
Owner string
MetaPartitions []*MetaPartition
OSSSecure *OSSSecure
CreateTime int64
DeleteLockTime int64
}
type OSSSecure struct {
AccessKey string
SecretKey string
}
type VolStatInfo = proto.VolStatInfo
func (mw *MetaWrapper) fetchVolumeView() (view *VolumeView, err error) {
var vv *proto.VolView
if mw.ownerValidation {
var authKey string
if authKey, err = calculateAuthKey(mw.owner); err != nil {
return
}
if mw.authenticate {
var (
tokenMessage string
ts int64
)
mw.accessToken.Type = proto.MsgMasterFetchVolViewReq
if tokenMessage, ts, err = genMasterToken(mw.accessToken, mw.sessionKey); err != nil {
log.LogWarnf("fetchVolumeView generate token failed: err(%v)", err)
return nil, err
}
var decoder master.Decoder = func(raw []byte) ([]byte, error) {
return mw.parseAndVerifyResp(raw, ts)
}
if vv, err = mw.mc.ClientAPI().GetVolumeWithAuthnode(mw.volname, authKey, tokenMessage, decoder); err != nil {
return
}
} else {
if vv, err = mw.mc.ClientAPI().GetVolume(mw.volname, authKey); err != nil {
return
}
}
} else {
if vv, err = mw.mc.ClientAPI().GetVolumeWithoutAuthKey(mw.volname); err != nil {
return
}
}
if vv.Status == 1 {
log.LogErrorf("fetchVolumeView: volume has been marked for deletion: volume(%v) status(%v - 0:normal/1:markDelete)",
vv.Name, vv.Status)
return nil, proto.ErrVolNotExists
}
convert := func(volView *proto.VolView) *VolumeView {
result := &VolumeView{
Name: volView.Name,
Owner: volView.Owner,
MetaPartitions: make([]*MetaPartition, len(volView.MetaPartitions)),
OSSSecure: &OSSSecure{},
CreateTime: volView.CreateTime,
DeleteLockTime: volView.DeleteLockTime,
}
if volView.OSSSecure != nil {
result.OSSSecure.AccessKey = volView.OSSSecure.AccessKey
result.OSSSecure.SecretKey = volView.OSSSecure.SecretKey
}
for i, mp := range volView.MetaPartitions {
result.MetaPartitions[i] = &MetaPartition{
PartitionID: mp.PartitionID,
Start: mp.Start,
End: mp.End,
Members: mp.Members,
LeaderAddr: mp.LeaderAddr,
Status: mp.Status,
}
}
return result
}
view = convert(vv)
return
}
// fetch and update cluster info if successful
func (mw *MetaWrapper) updateClusterInfo() (err error) {
var info *proto.ClusterInfo
if info, err = mw.mc.AdminAPI().GetClusterInfo(); err != nil {
log.LogWarnf("updateClusterInfo: get cluster info fail: err(%v) volume(%v)", err, mw.volname)
return
}
log.LogInfof("updateClusterInfo: get cluster info: cluster(%v) localIP(%v) volume(%v)",
info.Cluster, info.Ip, mw.volname)
mw.cluster = info.Cluster
mw.localIP = info.Ip
return
}
func (mw *MetaWrapper) updateDirChildrenNumLimit() (err error) {
var clusterInfo *proto.ClusterInfo
clusterInfo, err = mw.mc.AdminAPI().GetClusterInfo()
if err != nil {
return
}
if clusterInfo.DirChildrenNumLimit < proto.MinDirChildrenNumLimit {
log.LogWarnf("updateDirChildrenNumLimit: DirChildrenNumLimit probably not enabled on master, set to default value(%v)",
proto.DefaultDirChildrenNumLimit)
atomic.StoreUint32(&mw.DirChildrenNumLimit, proto.DefaultDirChildrenNumLimit)
} else {
atomic.StoreUint32(&mw.DirChildrenNumLimit, clusterInfo.DirChildrenNumLimit)
log.LogInfof("updateDirChildrenNumLimit: DirChildrenNumLimit(%v)", mw.DirChildrenNumLimit)
}
return
}
func (mw *MetaWrapper) updateVolStatInfo() (err error) {
var info *proto.VolStatInfo
if info, err = mw.mc.ClientAPI().GetVolumeStat(mw.volname); err != nil {
log.LogWarnf("updateVolStatInfo: get volume status fail: volume(%v) err(%v)", mw.volname, err)
return
}
if info.UsedSize > info.TotalSize {
log.LogInfof("volume(%v) queried usedSize(%v) is larger than totalSize(%v), force set usedSize as totalSize",
mw.volname, info.UsedSize, info.TotalSize)
info.UsedSize = info.TotalSize
}
atomic.StoreUint64(&mw.totalSize, info.TotalSize)
atomic.StoreUint64(&mw.usedSize, info.UsedSize)
atomic.StoreUint64(&mw.inodeCount, info.InodeCount)
log.LogInfof("VolStatInfo: volume(%v) info(%v)", mw.volname, info)
return
}
func (mw *MetaWrapper) updateMetaPartitions() error {
view, err := mw.fetchVolumeView()
if err != nil {
log.LogInfof("updateMetaPartition volume(%v) error: %v", mw.volname, err.Error())
switch err {
case proto.ErrExpiredTicket:
// TODO: bad logic, remove later (Mofei Zhang)
if e := mw.updateTicket(); e != nil {
log.LogFlush()
daemonize.SignalOutcome(err)
os.Exit(1)
}
log.LogInfof("updateTicket: ok!")
return err
case proto.ErrInvalidTicket:
// TODO: bad logic, remove later (Mofei Zhang)
log.LogFlush()
daemonize.SignalOutcome(err)
os.Exit(1)
default:
return err
}
}
rwPartitions := make([]*MetaPartition, 0)
for _, mp := range view.MetaPartitions {
mw.replaceOrInsertPartition(mp)
log.LogInfof("updateMetaPartition: mp(%v)", mp)
if mp.Status == proto.ReadWrite {
rwPartitions = append(rwPartitions, mp)
}
}
mw.ossSecure = view.OSSSecure
mw.volCreateTime = view.CreateTime
mw.volDeleteLockTime = view.DeleteLockTime
if len(rwPartitions) == 0 {
log.LogInfof("updateMetaPartition: no rw partitions")
return nil
}
mw.Lock()
mw.rwPartitions = rwPartitions
mw.Unlock()
return nil
}
func (mw *MetaWrapper) forceUpdateMetaPartitions() error {
// Only one forceUpdateMetaPartition is allowed in a specific period of time.
if ok := mw.forceUpdateLimit.AllowN(time.Now(), MinForceUpdateMetaPartitionsInterval); !ok {
return errors.New("Force update meta partitions throttled!")
}
return mw.updateMetaPartitions()
}
// Should be protected by partMutex, otherwise the caller might not be signaled.
func (mw *MetaWrapper) triggerAndWaitForceUpdate() {
mw.partMutex.Lock()
select {
case mw.forceUpdate <- struct{}{}:
default:
}
mw.partCond.Wait()
mw.partMutex.Unlock()
}
func (mw *MetaWrapper) refresh() {
var err error
t := time.NewTimer(RefreshMetaPartitionsInterval)
defer t.Stop()
for {
select {
case <-t.C:
if err = mw.updateMetaPartitions(); err != nil {
mw.onAsyncTaskError.OnError(err)
log.LogErrorf("updateMetaPartition fail cause: %v", err)
}
if err = mw.updateVolStatInfo(); err != nil {
mw.onAsyncTaskError.OnError(err)
log.LogErrorf("updateVolStatInfo fail cause: %v", err)
}
if err = mw.updateDirChildrenNumLimit(); err != nil {
mw.onAsyncTaskError.OnError(err)
log.LogErrorf("updateDirChildrenNumLimit fail cause: %v", err)
}
t.Reset(RefreshMetaPartitionsInterval)
case <-mw.forceUpdate:
log.LogInfof("Start forceUpdateMetaPartitions")
mw.partMutex.Lock()
if err = mw.forceUpdateMetaPartitions(); err == nil {
if err = mw.updateVolStatInfo(); err == nil {
t.Reset(RefreshMetaPartitionsInterval)
}
}
mw.partMutex.Unlock()
mw.partCond.Broadcast()
log.LogInfof("End forceUpdateMetaPartitions: err(%v)", err)
case <-mw.closeCh:
return
}
}
}
func calculateAuthKey(key string) (authKey string, err error) {
h := md5.New()
_, err = h.Write([]byte(key))
if err != nil {
log.LogErrorf("action[calculateAuthKey] calculate auth key[%v] failed,err[%v]", key, err)
return
}
cipherStr := h.Sum(nil)
return strings.ToLower(hex.EncodeToString(cipherStr)), nil
}
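// Example (illustrative): the auth key is the lowercase hex MD5 of the volume owner,
// so it is always 32 characters long.
//
//	authKey, _ := calculateAuthKey("") // "d41d8cd98f00b204e9800998ecf8427e"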
func genMasterToken(req proto.APIAccessReq, key string) (message string, ts int64, err error) {
var (
sessionKey []byte
data []byte
)
if sessionKey, err = cryptoutil.Base64Decode(key); err != nil {
return
}
if req.Verifier, ts, err = cryptoutil.GenVerifier(sessionKey); err != nil {
return
}
if data, err = json.Marshal(req); err != nil {
return
}
message = base64.StdEncoding.EncodeToString(data)
return
}
func (mw *MetaWrapper) updateTicket() error {
ticket, err := mw.ac.API().GetTicket(mw.owner, mw.ticketMess.ClientKey, proto.MasterServiceID)
if err != nil {
return errors.Trace(err, "Update ticket from authnode failed!")
}
mw.accessToken.Ticket = ticket.Ticket
mw.sessionKey = ticket.SessionKey
return nil
}
func (mw *MetaWrapper) parseAndVerifyResp(body []byte, ts int64) (dataBody []byte, err error) {
var resp proto.MasterAPIAccessResp
if resp, err = mw.parseRespWithAuth(body); err != nil {
log.LogWarnf("fetchVolumeView parse response failed: err(%v) body(%v)", err, string(body))
return nil, err
}
if err = proto.VerifyAPIRespComm(&(resp.APIResp), mw.accessToken.Type, mw.owner, proto.MasterServiceID, ts); err != nil {
log.LogWarnf("fetchVolumeView verify response: err(%v)", err)
return nil, err
}
viewBody := &struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data json.RawMessage
}{}
if err = json.Unmarshal(resp.Data, viewBody); err != nil {
log.LogWarnf("VolViewCache unmarshal: err(%v) body(%v)", err, viewBody)
return nil, err
}
if viewBody.Code != 0 {
return nil, fmt.Errorf("request error, code[%d], msg[%s]", viewBody.Code, viewBody.Msg)
}
return viewBody.Data, err
}
func (mw *MetaWrapper) parseRespWithAuth(body []byte) (resp proto.MasterAPIAccessResp, err error) {
var (
message string
sessionKey []byte
plaintext []byte
)
if err = json.Unmarshal(body, &message); err != nil {
return
}
if sessionKey, err = cryptoutil.Base64Decode(mw.sessionKey); err != nil {
return
}
if plaintext, err = cryptoutil.DecodeMessage(message, sessionKey); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
func (mw *MetaWrapper) updateQuotaInfoTick() {
mw.updateQuotaInfo()
ticker := time.NewTicker(10 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
mw.updateQuotaInfo()
case <-mw.closeCh:
return
}
}
}
func (mw *MetaWrapper) updateQuotaInfo() {
var volumeInfo *proto.SimpleVolView
volumeInfo, err := mw.mc.AdminAPI().GetVolumeSimpleInfo(mw.volname)
if err != nil {
return
}
mw.EnableQuota = volumeInfo.EnableQuota
if !mw.EnableQuota {
return
}
quotaInfos, err := mw.mc.AdminAPI().ListQuota(mw.volname)
if err != nil {
log.LogWarnf("updateQuotaInfo get quota info fail: vol [%v] err [%v]", mw.volname, err)
return
}
mw.QuotaLock.Lock()
defer mw.QuotaLock.Unlock()
mw.QuotaInfoMap = make(map[uint32]*proto.QuotaInfo)
for _, info := range quotaInfos {
mw.QuotaInfoMap[info.QuotaId] = info
log.LogDebugf("updateQuotaInfo quotaInfo [%v]", info)
}
}
func (mw *MetaWrapper) IsQuotaLimited(quotaIds []uint32) bool {
mw.QuotaLock.RLock()
defer mw.QuotaLock.RUnlock()
for _, quotaId := range quotaIds {
if info, isFind := mw.QuotaInfoMap[quotaId]; isFind {
if info.LimitedInfo.LimitedBytes {
log.LogDebugf("IsQuotaLimited quotaId [%v]", quotaId)
return true
}
}
log.LogDebugf("IsQuotaLimited false quota [%v]", quotaId)
}
return false
}
func (mw *MetaWrapper) GetQuotaFullPaths() (fullPaths []string) {
fullPaths = make([]string, 0)
mw.QuotaLock.RLock()
defer mw.QuotaLock.RUnlock()
for _, info := range mw.QuotaInfoMap {
for _, pathInfo := range info.PathInfos {
fullPaths = append(fullPaths, pathInfo.FullPath)
}
}
return fullPaths
}
func (mw *MetaWrapper) IsQuotaLimitedById(inodeId uint64, size bool, files bool) bool {
mp := mw.getPartitionByInode(inodeId)
if mp == nil {
log.LogErrorf("IsQuotaLimitedById: inodeId(%v)", inodeId)
return true
}
quotaInfos, err := mw.getInodeQuota(mp, inodeId)
if err != nil {
log.LogErrorf("IsQuotaLimitedById: get parent quota fail, inodeId(%v) err(%v)", inodeId, err)
return true
}
for quotaId := range quotaInfos {
if info, isFind := mw.QuotaInfoMap[quotaId]; isFind {
if size && info.LimitedInfo.LimitedBytes {
log.LogDebugf("IsQuotaLimitedById quotaId [%v]", quotaId)
return true
}
if files && info.LimitedInfo.LimitedFiles {
log.LogDebugf("IsQuotaLimitedById quotaId [%v]", quotaId)
return true
}
}
log.LogDebugf("IsQuotaLimitedById false quota [%v]", quotaId)
}
return false
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"errors"
"fmt"
)
var (
ExtentHasBeenDeletedError = errors.New("extent has been deleted")
ParameterMismatchError = errors.New("parameter mismatch error")
NoAvailableExtentError = errors.New("no available extent")
NoBrokenExtentError = errors.New("no unavailable extent")
NoSpaceError = errors.New("no space left on the device")
TryAgainError = errors.New("try again")
CrcMismatchError = errors.New("packet Crc is incorrect")
NoLeaderError = errors.New("no raft leader")
ExtentNotFoundError = errors.New("extent does not exist")
ExtentExistsError = errors.New("extent already exists")
ExtentIsFullError = errors.New("extent is full")
BrokenExtentError = errors.New("extent has been broken")
BrokenDiskError = errors.New("disk has broken")
ForbidWriteError = errors.New("single replica decommission forbid write")
VerNotConsistentError = errors.New("ver not consistent")
SnapshotNeedNewExtentError = errors.New("snapshot need new extent error")
)
func newParameterError(format string, a ...interface{}) error {
return fmt.Errorf("parameter mismatch error: %s", fmt.Sprintf(format, a...))
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"math"
"os"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
const (
ExtentOpenOpt = os.O_CREATE | os.O_RDWR | os.O_EXCL
ExtentHasClose = -1
SEEK_DATA = 3
SEEK_HOLE = 4
)
const (
ExtentMaxSize = 1024 * 1024 * 1024 * 1024 * 4 // 4TB
)
type ExtentInfo struct {
FileID uint64 `json:"fileId"`
Size uint64 `json:"size"`
Crc uint32 `json:"Crc"`
IsDeleted bool `json:"deleted"`
ModifyTime int64 `json:"modTime"` // random writes do not update the modify time
AccessTime int64 `json:"accessTime"`
Source string `json:"src"`
SnapshotDataOff uint64 `json:"snapSize"`
SnapPreAllocDataOff uint64 `json:"snapPreAllocSize"`
ApplyID uint64 `json:"applyID"`
}
func (ei *ExtentInfo) TotalSize() uint64 {
if ei.SnapshotDataOff > util.ExtentSize {
return ei.Size + (ei.SnapshotDataOff - util.ExtentSize)
}
return ei.Size
}
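// Worked example (illustrative sizes): only the snapshot bytes written beyond
// util.ExtentSize are added on top of Size.
//
//	ei := &ExtentInfo{Size: 64 << 20, SnapshotDataOff: util.ExtentSize + 4<<20}
//	ei.TotalSize() // 64MB + 4MB = 68MB; with SnapshotDataOff <= util.ExtentSize it is just Size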
func (ei *ExtentInfo) String() (m string) {
source := ei.Source
if source == "" {
source = "none"
}
return fmt.Sprintf("%v_%v_%v_%v_%v_%d_%d_%d", ei.FileID, ei.Size, ei.SnapshotDataOff, ei.IsDeleted, source, ei.ModifyTime, ei.AccessTime, ei.Crc)
}
// SortedExtentInfos defines a slice of ExtentInfo sortable by AccessTime (implements sort.Interface).
type SortedExtentInfos []*ExtentInfo
func (extInfos SortedExtentInfos) Len() int {
return len(extInfos)
}
func (extInfos SortedExtentInfos) Less(i, j int) bool {
return extInfos[i].AccessTime < extInfos[j].AccessTime
}
func (extInfos SortedExtentInfos) Swap(i, j int) {
extInfos[i], extInfos[j] = extInfos[j], extInfos[i]
}
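// Usage sketch (illustrative; `store` is an initialized *ExtentStore):
//
//	infos := store.DumpExtents()
//	sort.Sort(infos) // ascending AccessTime: infos[0] is the least recently accessed extent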
// Extent manages a single extent stored as one regular file on the local filesystem.
// This implementation keeps the header info and the data body in one single entry file.
// The header of an extent holds the inode value of this extent block and the CRCs of its data blocks.
type Extent struct {
file *os.File
filePath string
extentID uint64
modifyTime int64
accessTime int64
dataSize int64
hasClose int32
header []byte
snapshotDataOff uint64
sync.Mutex
}
// NewExtentInCore creates and returns a new extent instance.
func NewExtentInCore(name string, extentID uint64) *Extent {
e := new(Extent)
e.extentID = extentID
e.filePath = name
e.snapshotDataOff = util.ExtentSize
return e
}
func (e *Extent) String() string {
return fmt.Sprintf("%v_%v_%v", e.filePath, e.dataSize, e.snapshotDataOff)
}
func (e *Extent) GetSize() (int64, uint64) {
return e.dataSize, e.snapshotDataOff
}
func (e *Extent) HasClosed() bool {
return atomic.LoadInt32(&e.hasClose) == ExtentHasClose
}
// Close this extent and release FD.
func (e *Extent) Close() (err error) {
if e.HasClosed() {
return
}
if err = e.file.Close(); err != nil {
return
}
return
}
func (e *Extent) Exist() (exist bool) {
_, err := os.Stat(e.filePath)
if err != nil {
return os.IsExist(err)
}
return true
}
func (e *Extent) GetFile() *os.File {
return e.file
}
// InitToFS initializes the extent on the filesystem: it creates the entry file
// (ExtentOpenOpt fails if the file already exists) and resets the extent data size to zero.
func (e *Extent) InitToFS() (err error) {
if e.file, err = os.OpenFile(e.filePath, ExtentOpenOpt, 0o666); err != nil {
return err
}
if IsTinyExtent(e.extentID) {
e.dataSize = 0
return
}
atomic.StoreInt64(&e.modifyTime, time.Now().Unix())
atomic.StoreInt64(&e.accessTime, time.Now().Unix())
e.dataSize = 0
return
}
func (e *Extent) GetDataSize(statSize int64) (dataSize int64) {
var (
dataStart int64
holStart int64
curOff int64
err error
)
for {
// curOff is the hole start and the data end
curOff, err = e.file.Seek(holStart, SEEK_DATA)
if err != nil || curOff >= util.ExtentSize || (holStart > 0 && holStart == curOff) {
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
break
}
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
dataStart = curOff
curOff, err = e.file.Seek(dataStart, SEEK_HOLE)
if err != nil || curOff >= util.ExtentSize || dataStart == curOff {
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
break
}
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
holStart = curOff
}
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
if holStart == 0 {
if statSize > util.ExtentSize {
return util.ExtentSize
}
return statSize
}
return holStart
}
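// Worked example (illustrative layout): for a file whose first util.ExtentSize bytes
// are [data 0..4MB) [hole 4MB..8MB) [data 8MB..12MB), the SEEK_DATA/SEEK_HOLE loop
// ends with holStart = 12MB, so GetDataSize returns 12MB, the end of the last data
// run below util.ExtentSize. If no hole is ever found (holStart stays 0), it returns
// statSize capped at util.ExtentSize.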
// RestoreFromFS restores the entity data and status from the file stored on the filesystem.
func (e *Extent) RestoreFromFS() (err error) {
if e.file, err = os.OpenFile(e.filePath, os.O_RDWR, 0o666); err != nil {
if strings.Contains(err.Error(), syscall.ENOENT.Error()) {
err = ExtentNotFoundError
}
return err
}
var info os.FileInfo
if info, err = e.file.Stat(); err != nil {
err = fmt.Errorf("stat file %v: %v", e.file.Name(), err)
return
}
if IsTinyExtent(e.extentID) {
watermark := info.Size()
if watermark%util.PageSize != 0 {
watermark = watermark + (util.PageSize - watermark%util.PageSize)
}
e.dataSize = watermark
return
}
e.dataSize = e.GetDataSize(info.Size())
e.snapshotDataOff = util.ExtentSize
if info.Size() > util.ExtentSize {
e.snapshotDataOff = uint64(info.Size())
}
atomic.StoreInt64(&e.modifyTime, info.ModTime().Unix())
ts := info.Sys().(*syscall.Stat_t)
atomic.StoreInt64(&e.accessTime, time.Unix(int64(ts.Atim.Sec), int64(ts.Atim.Nsec)).Unix())
return
}
// Size returns length of the extent (not including the header).
func (e *Extent) Size() (size int64) {
return e.dataSize
}
// ModifyTime returns the time when this extent was modified recently.
func (e *Extent) ModifyTime() int64 {
return atomic.LoadInt64(&e.modifyTime)
}
func IsRandomWrite(writeType int) bool {
return writeType == RandomWriteType
}
func IsAppendWrite(writeType int) bool {
return writeType == AppendWriteType
}
func IsAppendRandomWrite(writeType int) bool {
return writeType == AppendRandomWriteType
}
// WriteTiny performs write on a tiny extent.
func (e *Extent) WriteTiny(data []byte, offset, size int64, crc uint32, writeType int, isSync bool) (err error) {
e.Lock()
defer e.Unlock()
index := offset + size
if index >= ExtentMaxSize {
return ExtentIsFullError
}
if IsAppendWrite(writeType) && offset != e.dataSize {
return ParameterMismatchError
}
if _, err = e.file.WriteAt(data[:size], int64(offset)); err != nil {
return
}
if isSync {
if err = e.file.Sync(); err != nil {
return
}
}
if !IsAppendWrite(writeType) {
return
}
if index%util.PageSize != 0 {
index = index + (util.PageSize - index%util.PageSize)
}
e.dataSize = index
return
}
// Write writes data to an extent.
func (e *Extent) Write(data []byte, offset, size int64, crc uint32, writeType int, isSync bool, crcFunc UpdateCrcFunc, ei *ExtentInfo) (status uint8, err error) {
log.LogDebugf("action[Extent.Write] path %v offset %v size %v writeType %v", e.filePath, offset, size, writeType)
status = proto.OpOk
if IsTinyExtent(e.extentID) {
err = e.WriteTiny(data, offset, size, crc, writeType, isSync)
return
}
if err = e.checkWriteOffsetAndSize(writeType, offset, size); err != nil {
log.LogErrorf("action[Extent.Write] checkWriteOffsetAndSize offset %v size %v writeType %v err %v",
offset, size, writeType, err)
err = newParameterError("extent current size=%d write offset=%d write size=%d", e.dataSize, offset, size)
log.LogInfof("action[Extent.Write] newParameterError path %v offset %v size %v writeType %v err %v", e.filePath,
offset, size, writeType, err)
status = proto.OpTryOtherExtent
return
}
log.LogDebugf("action[Extent.Write] path %v offset %v size %v writeType %v", e.filePath, offset, size, writeType)
// Check if extent file size matches the write offset just in case
// multiple clients are writing concurrently.
e.Lock()
defer e.Unlock()
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v path %v", offset, size, writeType, e.filePath)
if IsAppendWrite(writeType) && e.dataSize != offset {
err = newParameterError("extent current size=%d write offset=%d write size=%d", e.dataSize, offset, size)
log.LogInfof("action[Extent.Write] newParameterError path %v offset %v size %v writeType %v err %v", e.filePath,
offset, size, writeType, err)
status = proto.OpTryOtherExtent
return
}
if IsAppendRandomWrite(writeType) {
if e.snapshotDataOff <= util.ExtentSize {
log.LogInfof("action[Extent.Write] truncate extent %v offset %v size %v writeType %v truncate err %v", e, offset, size, writeType, err)
if err = e.file.Truncate(util.ExtentSize); err != nil {
log.LogErrorf("action[Extent.Write] offset %v size %v writeType %v truncate err %v", offset, size, writeType, err)
return
}
}
}
if _, err = e.file.WriteAt(data[:size], int64(offset)); err != nil {
log.LogErrorf("action[Extent.Write] offset %v size %v writeType %v err %v", offset, size, writeType, err)
return
}
blockNo := offset / util.BlockSize
offsetInBlock := offset % util.BlockSize
defer func() {
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v path %v", offset, size, writeType, e.filePath)
if IsAppendWrite(writeType) {
atomic.StoreInt64(&e.modifyTime, time.Now().Unix())
e.dataSize = int64(math.Max(float64(e.dataSize), float64(offset+size)))
log.LogDebugf("action[Extent.Write] e %v offset %v size %v writeType %v", e, offset, size, writeType)
} else if IsAppendRandomWrite(writeType) {
atomic.StoreInt64(&e.modifyTime, time.Now().Unix())
e.snapshotDataOff = uint64(math.Max(float64(e.snapshotDataOff), float64(offset+size)))
}
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v dataSize %v snapshotDataOff %v",
offset, size, writeType, e.dataSize, e.snapshotDataOff)
}()
if isSync {
if err = e.file.Sync(); err != nil {
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v err %v",
offset, size, writeType, err)
return
}
}
if offsetInBlock == 0 && size == util.BlockSize {
err = crcFunc(e, int(blockNo), crc)
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v err %v", offset, size, writeType, err)
return
}
if offsetInBlock+size <= util.BlockSize {
err = crcFunc(e, int(blockNo), 0)
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v err %v", offset, size, writeType, err)
return
}
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v", offset, size, writeType)
if err = crcFunc(e, int(blockNo), 0); err == nil {
err = crcFunc(e, int(blockNo+1), 0)
}
return
}
// Read reads data from an extent.
func (e *Extent) Read(data []byte, offset, size int64, isRepairRead bool) (crc uint32, err error) {
log.LogDebugf("action[Extent.read] offset %v size %v extent %v", offset, size, e)
if IsTinyExtent(e.extentID) {
return e.ReadTiny(data, offset, size, isRepairRead)
}
if err = e.checkReadOffsetAndSize(offset, size); err != nil {
log.LogErrorf("action[Extent.Read] offset %d size %d err %v", offset, size, err)
return
}
var rSize int
if rSize, err = e.file.ReadAt(data[:size], offset); err != nil {
log.LogErrorf("action[Extent.Read] offset %v size %v err %v realsize %v", offset, size, err, rSize)
return
}
crc = crc32.ChecksumIEEE(data)
return
}
// ReadTiny read data from a tiny extent.
func (e *Extent) ReadTiny(data []byte, offset, size int64, isRepairRead bool) (crc uint32, err error) {
_, err = e.file.ReadAt(data[:size], offset)
if isRepairRead && err == io.EOF {
err = nil
}
crc = crc32.ChecksumIEEE(data[:size])
return
}
func (e *Extent) checkReadOffsetAndSize(offset, size int64) error {
if (e.snapshotDataOff == util.ExtentSize && offset > e.Size()) ||
(e.snapshotDataOff > util.ExtentSize && uint64(offset) > e.snapshotDataOff) {
return newParameterError("offset=%d size=%d snapshotDataOff=%d", offset, size, e.snapshotDataOff)
}
return nil
}
func (e *Extent) checkWriteOffsetAndSize(writeType int, offset, size int64) error {
err := newParameterError("writeType=%d offset=%d size=%d", writeType, offset, size)
if IsAppendWrite(writeType) {
if size == 0 || size > util.BlockSize ||
offset+size > util.ExtentSize || offset >= util.ExtentSize {
return err
}
} else if IsAppendRandomWrite(writeType) {
log.LogDebugf("action[checkOffsetAndSize] offset %v size %v", offset, size)
if offset < util.ExtentSize || size == 0 {
return err
}
}
return nil
}
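// Worked example (illustrative, with E = util.ExtentSize and B = util.BlockSize):
// an AppendWriteType write must satisfy 0 < size <= B and offset+size <= E (offset < E);
// an AppendRandomWriteType write must start at or beyond E, i.e. inside the snapshot
// region, with a non-zero size; RandomWriteType offsets are not constrained here.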
// Flush synchronizes data to the disk.
func (e *Extent) Flush() (err error) {
err = e.file.Sync()
return
}
func (e *Extent) GetCrc(blockNo int64) uint32 {
if int64(len(e.header)) < (blockNo+1)*util.PerBlockCrcSize {
return 0
}
return binary.BigEndian.Uint32(e.header[blockNo*util.PerBlockCrcSize : (blockNo+1)*util.PerBlockCrcSize])
}
func (e *Extent) autoComputeExtentCrc(crcFunc UpdateCrcFunc) (crc uint32, err error) {
var blockCnt int
extSize := e.Size()
if e.snapshotDataOff > util.ExtentSize {
extSize = int64(e.snapshotDataOff)
}
blockCnt = int(extSize / util.BlockSize)
if extSize%util.BlockSize != 0 {
blockCnt += 1
}
log.LogDebugf("autoComputeExtentCrc. path %v extent %v extent size %v,blockCnt %v", e.filePath, e.extentID, extSize, blockCnt)
crcData := make([]byte, blockCnt*util.PerBlockCrcSize)
for blockNo := 0; blockNo < blockCnt; blockNo++ {
blockCrc := binary.BigEndian.Uint32(e.header[blockNo*util.PerBlockCrcSize : (blockNo+1)*util.PerBlockCrcSize])
if blockCrc != 0 {
binary.BigEndian.PutUint32(crcData[blockNo*util.PerBlockCrcSize:(blockNo+1)*util.PerBlockCrcSize], blockCrc)
continue
}
bdata := make([]byte, util.BlockSize)
offset := int64(blockNo * util.BlockSize)
readN, err := e.file.ReadAt(bdata[:util.BlockSize], offset)
if readN == 0 && err != nil {
log.LogErrorf("autoComputeExtentCrc. path %v extent %v blockNo %v, readN %v err %v", e.filePath, e.extentID, blockNo, readN, err)
break
}
blockCrc = crc32.ChecksumIEEE(bdata[:readN])
err = crcFunc(e, blockNo, blockCrc)
if err != nil {
log.LogErrorf("autoComputeExtentCrc. path %v extent %v blockNo %v, err %v", e.filePath, e.extentID, blockNo, err)
return 0, err
}
log.LogDebugf("autoComputeExtentCrc. path %v extent %v blockCrc %v,blockNo %v", e.filePath, e.extentID, blockCrc, blockNo)
binary.BigEndian.PutUint32(crcData[blockNo*util.PerBlockCrcSize:(blockNo+1)*util.PerBlockCrcSize], blockCrc)
}
crc = crc32.ChecksumIEEE(crcData)
log.LogDebugf("autoComputeExtentCrc. path %v extent %v crc %v", e.filePath, e.extentID, crc)
return crc, err
}
// punchDelete punches a hole over [offset, offset+size) to release the underlying blocks.
// offset must be aligned to util.PageSize and size is rounded up to a multiple of it.
// It returns hasDelete=true when the range already contains no data.
func (e *Extent) punchDelete(offset, size int64) (hasDelete bool, err error) {
log.LogDebugf("punchDelete extent %v offset %v, size %v", e, offset, size)
if int(offset)%util.PageSize != 0 {
return false, ParameterMismatchError
}
if int(size)%util.PageSize != 0 {
size += int64(util.PageSize - int(size)%util.PageSize)
}
newOffset, err := e.file.Seek(offset, SEEK_DATA)
if err != nil {
if strings.Contains(err.Error(), syscall.ENXIO.Error()) {
return true, nil
}
return false, err
}
if newOffset-offset >= size {
return true, nil
}
log.LogDebugf("punchDelete offset %v size %v", offset, size)
err = fallocate(int(e.file.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize, offset, size)
return
}
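// Worked example (illustrative, assuming a 4KB util.PageSize): punchDelete(8192, 10240)
// first rounds size up to 12288, then seeks for data from offset 8192; if no data is
// found within the range it reports hasDelete=true, otherwise it punches a hole with
// fallocate(util.FallocFLPunchHole|util.FallocFLKeepSize) so the blocks are released
// without changing the file size.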
func (e *Extent) getRealBlockCnt() (blockNum int64) {
stat := new(syscall.Stat_t)
syscall.Stat(e.filePath, stat)
return stat.Blocks
}
func (e *Extent) TinyExtentRecover(data []byte, offset, size int64, crc uint32, isEmptyPacket bool) (err error) {
e.Lock()
defer e.Unlock()
if !IsTinyExtent(e.extentID) {
return ParameterMismatchError
}
if offset%util.PageSize != 0 || offset != e.dataSize {
return fmt.Errorf("error empty packet on (%v) offset(%v) size(%v)"+
" isEmptyPacket(%v) e.dataSize(%v)", e.file.Name(), offset, size, isEmptyPacket, e.dataSize)
}
log.LogDebugf("before file (%v) getRealBlockNo (%v) isEmptyPacket(%v)"+
"offset(%v) size(%v) e.datasize(%v)", e.filePath, e.getRealBlockCnt(), isEmptyPacket, offset, size, e.dataSize)
if isEmptyPacket {
var finfo os.FileInfo
finfo, err = e.file.Stat()
if err != nil {
return err
}
if offset < finfo.Size() {
return fmt.Errorf("error empty packet on (%v) offset(%v) size(%v)"+
" isEmptyPacket(%v) filesize(%v) e.dataSize(%v)", e.file.Name(), offset, size, isEmptyPacket, finfo.Size(), e.dataSize)
}
if err = syscall.Ftruncate(int(e.file.Fd()), offset+size); err != nil {
return err
}
err = fallocate(int(e.file.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize, offset, size)
} else {
_, err = e.file.WriteAt(data[:size], int64(offset))
}
if err != nil {
return
}
watermark := offset + size
if watermark%util.PageSize != 0 {
watermark = watermark + (util.PageSize - watermark%util.PageSize)
}
e.dataSize = watermark
log.LogDebugf("after file (%v) getRealBlockNo (%v) isEmptyPacket(%v)"+
"offset(%v) size(%v) e.datasize(%v)", e.filePath, e.getRealBlockCnt(), isEmptyPacket, offset, size, e.dataSize)
return
}
func (e *Extent) tinyExtentAvaliOffset(offset int64) (newOffset, newEnd int64, err error) {
e.Lock()
defer e.Unlock()
newOffset, err = e.file.Seek(int64(offset), SEEK_DATA)
if err != nil {
return
}
newEnd, err = e.file.Seek(int64(newOffset), SEEK_HOLE)
if err != nil {
return
}
if newOffset-offset > util.BlockSize {
newOffset = offset + util.BlockSize
}
if newEnd-newOffset > util.BlockSize {
newEnd = newOffset + util.BlockSize
}
if newEnd < newOffset {
err = fmt.Errorf("unavali TinyExtentAvaliOffset on SEEK_DATA or SEEK_HOLE (%v) offset(%v) "+
"newEnd(%v) newOffset(%v)", e.extentID, offset, newEnd, newOffset)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"container/list"
"sync"
)
// ExtentMapItem stores the extent entity pointer and the element
// pointer of the extent entity in a cache list.
type ExtentMapItem struct {
e *Extent
element *list.Element
}
// ExtentCache is an LRU cache of open extents. Tiny extents are tracked in a separate map and are never evicted.
type ExtentCache struct {
extentMap map[uint64]*ExtentMapItem
extentList *list.List
tinyExtents map[uint64]*Extent
tinyLock sync.RWMutex
lock sync.RWMutex
capacity int
}
// NewExtentCache creates and returns a new ExtentCache instance.
func NewExtentCache(capacity int) *ExtentCache {
return &ExtentCache{
extentMap: make(map[uint64]*ExtentMapItem),
extentList: list.New(),
capacity: capacity,
tinyExtents: make(map[uint64]*Extent),
}
}
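// Usage sketch (illustrative; the path and extent ID are placeholders, and in real use
// the extent is opened via InitToFS or RestoreFromFS before it is cached):
//
//	cache := NewExtentCache(64)
//	e := NewExtentInCore("/data/partition_1/1025", 1025)
//	cache.Put(e) // normal extents join the LRU list; tiny extents go to a separate map
//	if cached, ok := cache.Get(1025); ok {
//		_ = cached
//	}
//	cache.Del(1025) // removes the entry and closes the extent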
// Put puts an extent object into the cache.
func (cache *ExtentCache) Put(e *Extent) {
if IsTinyExtent(e.extentID) {
cache.tinyLock.Lock()
cache.tinyExtents[e.extentID] = e
cache.tinyLock.Unlock()
return
}
cache.lock.Lock()
defer cache.lock.Unlock()
item := &ExtentMapItem{
e: e,
element: cache.extentList.PushBack(e),
}
cache.extentMap[e.extentID] = item
cache.evict()
}
// Get gets the extent from the cache.
func (cache *ExtentCache) Get(extentID uint64) (e *Extent, ok bool) {
if IsTinyExtent(extentID) {
cache.tinyLock.RLock()
e, ok = cache.tinyExtents[extentID]
cache.tinyLock.RUnlock()
return
}
cache.lock.Lock()
defer cache.lock.Unlock()
var item *ExtentMapItem
if item, ok = cache.extentMap[extentID]; ok {
if !IsTinyExtent(extentID) {
cache.extentList.MoveToBack(item.element)
}
e = item.e
}
return
}
// Del deletes the extent stored in the cache.
func (cache *ExtentCache) Del(extentID uint64) {
if IsTinyExtent(extentID) {
return
}
cache.lock.Lock()
defer cache.lock.Unlock()
var (
item *ExtentMapItem
ok bool
)
if item, ok = cache.extentMap[extentID]; ok {
delete(cache.extentMap, extentID)
cache.extentList.Remove(item.element)
item.e.Close()
}
}
// Clear closes all the extents stored in the cache.
func (cache *ExtentCache) Clear() {
cache.tinyLock.RLock()
for _, extent := range cache.tinyExtents {
extent.Close()
}
cache.tinyLock.RUnlock()
cache.lock.Lock()
defer cache.lock.Unlock()
for e := cache.extentList.Front(); e != nil; {
curr := e
e = e.Next()
ec := curr.Value.(*Extent)
delete(cache.extentMap, ec.extentID)
ec.Close()
cache.extentList.Remove(curr)
}
cache.extentList = list.New()
cache.extentMap = make(map[uint64]*ExtentMapItem)
}
// Size returns the number of normal extents stored in the cache (tiny extents are not counted).
func (cache *ExtentCache) Size() int {
cache.lock.RLock()
defer cache.lock.RUnlock()
return cache.extentList.Len()
}
func (cache *ExtentCache) evict() {
if cache.capacity <= 0 {
return
}
needRemove := cache.extentList.Len() - cache.capacity
for i := 0; i < needRemove; i++ {
if e := cache.extentList.Front(); e != nil {
front := e.Value.(*Extent)
if IsTinyExtent(front.extentID) {
continue
}
delete(cache.extentMap, front.extentID)
cache.extentList.Remove(e)
front.Close()
}
}
}
// Flush synchronizes all the extents stored in the cache to the disk.
func (cache *ExtentCache) Flush() {
cache.tinyLock.RLock()
for _, extent := range cache.tinyExtents {
extent.Flush()
}
cache.tinyLock.RUnlock()
cache.lock.RLock()
defer cache.lock.RUnlock()
for _, item := range cache.extentMap {
item.e.Flush()
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"bytes"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"os"
"path"
"regexp"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
//TODO: remove this later.
//go:generate golangci-lint run --issues-exit-code=1 -D errcheck -E bodyclose ./...
const (
ExtCrcHeaderFileName = "EXTENT_CRC"
ExtBaseExtentIDFileName = "EXTENT_META"
TinyDeleteFileOpt = os.O_CREATE | os.O_RDWR | os.O_APPEND
TinyExtDeletedFileName = "TINYEXTENT_DELETE"
NormalExtDeletedFileName = "NORMALEXTENT_DELETE"
MaxExtentCount = 20000
TinyExtentCount = 64
TinyExtentStartID = 1
MinExtentID = 1024
DeleteTinyRecordSize = 24
UpdateCrcInterval = 600
RepairInterval = 60
RandomWriteType = 2
AppendWriteType = 1
AppendRandomWriteType = 4
NormalExtentDeleteRetainTime = 3600 * 4
StaleExtStoreBackupSuffix = ".old"
StaleExtStoreTimeFormat = "20060102150405.000000000"
)
var (
RegexpExtentFile, _ = regexp.Compile(`^(\d)+$`)
SnapShotFilePool = &sync.Pool{New: func() interface{} {
return new(proto.File)
}}
)
func GetSnapShotFileFromPool() (f *proto.File) {
f = SnapShotFilePool.Get().(*proto.File)
return
}
func PutSnapShotFileToPool(f *proto.File) {
SnapShotFilePool.Put(f)
}
type ExtentFilter func(info *ExtentInfo) bool
// Filters
var (
NormalExtentFilter = func() ExtentFilter {
now := time.Now()
return func(ei *ExtentInfo) bool {
return !IsTinyExtent(ei.FileID) && now.Unix()-ei.ModifyTime > RepairInterval && !ei.IsDeleted
}
}
TinyExtentFilter = func(filters []uint64) ExtentFilter {
return func(ei *ExtentInfo) bool {
if !IsTinyExtent(ei.FileID) {
return false
}
for _, filterID := range filters {
if filterID == ei.FileID {
return true
}
}
return false
}
}
)
// ExtentStore defines fields used in the storage engine.
// Packets smaller than 128K are stored in a "tinyExtent", a place to persist the small files.
// Packets larger than or equal to 128K are stored in a normal "extent", a place to persist large files.
// The difference between them is the ID range: tiny extent IDs run from TinyExtentStartID to
// TinyExtentStartID+TinyExtentCount-1, while normal extent IDs start at MinExtentID.
// Multiple small files can be appended to the same tinyExtent.
// In addition, the deletion of small files is implemented by punching holes in the underlying file system.
type ExtentStore struct {
dataPath string
baseExtentID uint64 // highest normal extent ID allocated so far; new extent IDs are allocated above it
extentInfoMap map[uint64]*ExtentInfo // map that stores all the extent information
eiMutex sync.RWMutex // mutex for extent info
cache *ExtentCache // extent cache
mutex sync.Mutex
storeSize int // size of the extent store
metadataFp *os.File // metadata file pointer?
tinyExtentDeleteFp *os.File
normalExtentDeleteFp *os.File
closeC chan bool
closed bool
availableTinyExtentC chan uint64 // available tinyExtent channel
availableTinyExtentMap sync.Map
brokenTinyExtentC chan uint64 // broken tinyExtent channel
brokenTinyExtentMap sync.Map
// blockSize int
partitionID uint64
verifyExtentFp *os.File
verifyExtentFpAppend []*os.File
hasAllocSpaceExtentIDOnVerfiyFile uint64
hasDeleteNormalExtentsCache sync.Map
partitionType int
ApplyId uint64
ApplyIdMutex sync.RWMutex
}
func MkdirAll(name string) (err error) {
return os.MkdirAll(name, 0o755)
}
func NewExtentStore(dataDir string, partitionID uint64, storeSize, dpType int, isCreate bool) (s *ExtentStore, err error) {
s = new(ExtentStore)
s.dataPath = dataDir
s.partitionType = dpType
s.partitionID = partitionID
if isCreate {
if err = s.renameStaleExtentStore(); err != nil {
return
}
if err = MkdirAll(dataDir); err != nil {
return nil, fmt.Errorf("NewExtentStore [%v] err[%v]", dataDir, err)
}
if s.tinyExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, TinyExtDeletedFileName), TinyDeleteFileOpt, 0o666); err != nil {
return
}
if s.verifyExtentFp, err = os.OpenFile(path.Join(s.dataPath, ExtCrcHeaderFileName), os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
if s.metadataFp, err = os.OpenFile(path.Join(s.dataPath, ExtBaseExtentIDFileName), os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
if s.normalExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, NormalExtDeletedFileName), os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o666); err != nil {
return
}
} else {
if err = MkdirAll(dataDir); err != nil {
return nil, fmt.Errorf("NewExtentStore [%v] err[%v]", dataDir, err)
}
if s.tinyExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, TinyExtDeletedFileName), os.O_RDWR|os.O_APPEND, 0o666); err != nil {
return
}
if s.verifyExtentFp, err = os.OpenFile(path.Join(s.dataPath, ExtCrcHeaderFileName), os.O_RDWR, 0o666); err != nil {
return
}
if s.metadataFp, err = os.OpenFile(path.Join(s.dataPath, ExtBaseExtentIDFileName), os.O_RDWR, 0o666); err != nil {
return
}
if s.normalExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, NormalExtDeletedFileName), os.O_RDWR|os.O_APPEND, 0o666); err != nil {
return
}
}
stat, err := s.tinyExtentDeleteFp.Stat()
if err != nil {
return
}
if stat.Size()%DeleteTinyRecordSize != 0 {
needWriteEmpty := DeleteTinyRecordSize - (stat.Size() % DeleteTinyRecordSize)
data := make([]byte, needWriteEmpty)
s.tinyExtentDeleteFp.Write(data)
}
log.LogDebugf("NewExtentStore.partitionID [%v] dataPath %v verifyExtentFp init", partitionID, s.dataPath)
if s.verifyExtentFp, err = os.OpenFile(path.Join(s.dataPath, ExtCrcHeaderFileName), os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
aId := 0
var vFp *os.File
for {
dataPath := path.Join(s.dataPath, ExtCrcHeaderFileName+"_"+strconv.Itoa(aId))
if _, err = os.Stat(dataPath); err != nil {
log.LogDebugf("NewExtentStore. partitionID [%v] dataPath not exist err %v. verifyExtentFpAppend init return", partitionID, err)
break
}
if vFp, err = os.OpenFile(dataPath, os.O_CREATE|os.O_RDWR, 0o666); err != nil {
log.LogErrorf("NewExtentStore. partitionID [%v] dataPath exist but open err %v. verifyExtentFpAppend init return", partitionID, err)
return
}
log.LogDebugf("NewExtentStore. partitionID [%v] dataPath exist and opened id %v", partitionID, aId)
s.verifyExtentFpAppend = append(s.verifyExtentFpAppend, vFp)
aId++
}
if s.metadataFp, err = os.OpenFile(path.Join(s.dataPath, ExtBaseExtentIDFileName), os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
if s.normalExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, NormalExtDeletedFileName), os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o666); err != nil {
return
}
s.extentInfoMap = make(map[uint64]*ExtentInfo)
s.cache = NewExtentCache(100)
if err = s.initBaseFileID(); err != nil {
err = fmt.Errorf("init base field ID: %v", err)
return
}
s.hasAllocSpaceExtentIDOnVerfiyFile = s.GetPreAllocSpaceExtentIDOnVerifyFile()
s.storeSize = storeSize
s.closeC = make(chan bool, 1)
s.closed = false
err = s.initTinyExtent()
if err != nil {
return
}
return
}
func (ei *ExtentInfo) UpdateExtentInfo(extent *Extent, crc uint32) {
extent.Lock()
defer extent.Unlock()
if time.Now().Unix()-extent.ModifyTime() <= UpdateCrcInterval {
crc = 0
}
ei.Size = uint64(extent.dataSize)
ei.SnapshotDataOff = extent.snapshotDataOff
log.LogInfof("action[ExtentInfo.UpdateExtentInfo] ei info [%v]", ei.String())
if !IsTinyExtent(ei.FileID) {
atomic.StoreUint32(&ei.Crc, crc)
ei.ModifyTime = extent.ModifyTime()
}
}
// SnapShot returns the information of all the extents on the current data partition.
// When the master sends the loadDataPartition request, the snapshot is used to compare the replicas.
func (s *ExtentStore) SnapShot() (files []*proto.File, err error) {
var normalExtentSnapshot, tinyExtentSnapshot []*ExtentInfo
// compute the crc again to guarantee that the crc and applyID are the newest
s.autoComputeExtentCrc()
if normalExtentSnapshot, _, err = s.GetAllWatermarks(NormalExtentFilter()); err != nil {
log.LogErrorf("SnapShot GetAllWatermarks err %v", err)
return
}
files = make([]*proto.File, 0, len(normalExtentSnapshot))
for _, ei := range normalExtentSnapshot {
file := GetSnapShotFileFromPool()
file.Name = strconv.FormatUint(ei.FileID, 10)
file.Size = uint32(ei.Size)
file.Modified = ei.ModifyTime
file.Crc = atomic.LoadUint32(&ei.Crc)
file.ApplyID = ei.ApplyID
log.LogDebugf("partitionID %v ExtentStore set applyid %v partition %v", s.partitionID, s.ApplyId, s.partitionID)
files = append(files, file)
}
tinyExtentSnapshot = s.getTinyExtentInfo()
for _, ei := range tinyExtentSnapshot {
file := GetSnapShotFileFromPool()
file.Name = strconv.FormatUint(ei.FileID, 10)
file.Size = uint32(ei.Size)
file.Modified = ei.ModifyTime
file.Crc = 0
files = append(files, file)
}
return
}
// Create creates an extent.
func (s *ExtentStore) Create(extentID uint64) (err error) {
var e *Extent
name := path.Join(s.dataPath, strconv.Itoa(int(extentID)))
if s.HasExtent(extentID) {
err = ExtentExistsError
return err
}
e = NewExtentInCore(name, extentID)
e.header = make([]byte, util.BlockHeaderSize)
err = e.InitToFS()
if err != nil {
return err
}
s.cache.Put(e)
extInfo := &ExtentInfo{FileID: extentID}
extInfo.UpdateExtentInfo(e, 0)
atomic.StoreInt64(&extInfo.AccessTime, e.accessTime)
s.eiMutex.Lock()
s.extentInfoMap[extentID] = extInfo
s.eiMutex.Unlock()
s.UpdateBaseExtentID(extentID)
return
}
func (s *ExtentStore) initBaseFileID() error {
var baseFileID uint64
baseFileID, _ = s.GetPersistenceBaseExtentID()
files, err := os.ReadDir(s.dataPath)
if err != nil {
return err
}
var (
extentID uint64
isExtent bool
e *Extent
ei *ExtentInfo
loadErr error
)
for _, f := range files {
if extentID, isExtent = s.ExtentID(f.Name()); !isExtent {
continue
}
if e, loadErr = s.extent(extentID); loadErr != nil {
log.LogError("[initBaseFileID] load extent error", loadErr)
continue
}
ei = &ExtentInfo{FileID: extentID}
ei.UpdateExtentInfo(e, 0)
atomic.StoreInt64(&ei.AccessTime, e.accessTime)
s.eiMutex.Lock()
s.extentInfoMap[extentID] = ei
s.eiMutex.Unlock()
e.Close()
if !IsTinyExtent(extentID) && extentID > baseFileID {
baseFileID = extentID
}
}
if baseFileID < MinExtentID {
baseFileID = MinExtentID
}
atomic.StoreUint64(&s.baseExtentID, baseFileID)
log.LogInfof("datadir(%v) maxBaseId(%v)", s.dataPath, baseFileID)
runtime.GC()
return nil
}
// Write writes the given extent to the disk.
func (s *ExtentStore) Write(extentID uint64, offset, size int64, data []byte, crc uint32, writeType int, isSync bool) (status uint8, err error) {
var (
e *Extent
ei *ExtentInfo
)
status = proto.OpOk
s.eiMutex.Lock()
ei = s.extentInfoMap[extentID]
e, err = s.extentWithHeader(ei)
s.eiMutex.Unlock()
if err != nil {
return status, err
}
// update access time
atomic.StoreInt64(&ei.AccessTime, time.Now().Unix())
log.LogDebugf("action[Write] dp %v extentID %v offset %v size %v writeTYPE %v", s.partitionID, extentID, offset, size, writeType)
if err = s.checkOffsetAndSize(extentID, offset, size, writeType); err != nil {
log.LogInfof("action[Write] path %v err %v", e.filePath, err)
return status, err
}
status, err = e.Write(data, offset, size, crc, writeType, isSync, s.PersistenceBlockCrc, ei)
if err != nil {
log.LogInfof("action[Write] path %v err %v", e.filePath, err)
return status, err
}
ei.UpdateExtentInfo(e, 0)
return status, nil
}
func (s *ExtentStore) checkOffsetAndSize(extentID uint64, offset, size int64, writeType int) error {
if IsTinyExtent(extentID) {
return nil
}
// A random write position can occur on the modAppend portion of an extent.
if writeType == RandomWriteType {
return nil
}
if writeType == AppendRandomWriteType {
if offset < util.ExtentSize {
return newParameterError("writeType=%d offset=%d size=%d", writeType, offset, size)
}
return nil
}
if size == 0 || size > util.BlockSize ||
offset >= util.BlockCount*util.BlockSize ||
offset+size > util.BlockCount*util.BlockSize {
return newParameterError("offset=%d size=%d", offset, size)
}
return nil
}
// IsTinyExtent checks whether the given extent ID falls in the tiny-extent range.
func IsTinyExtent(extentID uint64) bool {
return extentID >= TinyExtentStartID && extentID < TinyExtentStartID+TinyExtentCount
}
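// Example (illustrative sketch): extent IDs are partitioned into a tiny range and a
// normal range purely by ID; the check above is a half-open interval test on the
// constants defined elsewhere in this package:
//
//	ok := IsTinyExtent(TinyExtentStartID)                  // true: first tiny extent
//	ok = IsTinyExtent(TinyExtentStartID + TinyExtentCount) // false: just past the tiny range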
// Read reads the extent based on the given id.
func (s *ExtentStore) Read(extentID uint64, offset, size int64, nbuf []byte, isRepairRead bool) (crc uint32, err error) {
var e *Extent
s.eiMutex.RLock()
ei := s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if ei == nil {
return 0, errors.Trace(ExtentHasBeenDeletedError, "[Read] extent[%d] has already been deleted", extentID)
}
// update extent access time
atomic.StoreInt64(&ei.AccessTime, time.Now().Unix())
if e, err = s.extentWithHeader(ei); err != nil {
return
}
//if err = s.checkOffsetAndSize(extentID, offset, size); err != nil {
// return
//}
crc, err = e.Read(nbuf, offset, size, isRepairRead)
return
}
func (s *ExtentStore) DumpExtents() (extInfos SortedExtentInfos) {
s.eiMutex.RLock()
for _, v := range s.extentInfoMap {
extInfos = append(extInfos, v)
}
s.eiMutex.RUnlock()
return
}
func (s *ExtentStore) punchDelete(extentID uint64, offset, size int64) (err error) {
e, err := s.extentWithHeaderByExtentID(extentID)
if err != nil {
return nil
}
if offset+size > e.dataSize {
return
}
var hasDelete bool
if hasDelete, err = e.punchDelete(offset, size); err != nil {
return
}
if hasDelete {
return
}
if err = s.RecordTinyDelete(e.extentID, offset, size); err != nil {
return
}
return
}
// MarkDelete marks the given extent as deleted.
func (s *ExtentStore) MarkDelete(extentID uint64, offset, size int64) (err error) {
var ei *ExtentInfo
s.eiMutex.RLock()
ei = s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if ei == nil || ei.IsDeleted {
return
}
log.LogDebugf("action[MarkDelete] extentID %v offset %v size %v ei(size %v snapshotSize %v)",
extentID, offset, size, ei.Size, ei.SnapshotDataOff)
funcNeedPunchDel := func() bool {
return offset != 0 || (size != 0 && ((ei.Size != uint64(size) && ei.SnapshotDataOff == util.ExtentSize) ||
(ei.SnapshotDataOff != uint64(size) && ei.SnapshotDataOff > util.ExtentSize)))
}
if IsTinyExtent(extentID) || funcNeedPunchDel() {
log.LogDebugf("action[MarkDelete] extentID %v offset %v size %v ei(size %v snapshotSize %v)",
extentID, offset, size, ei.Size, ei.SnapshotDataOff)
return s.punchDelete(extentID, offset, size)
}
extentFilePath := path.Join(s.dataPath, strconv.FormatUint(extentID, 10))
log.LogDebugf("action[MarkDelete] extentID %v offset %v size %v ei(size %v extentFilePath %v)",
extentID, offset, size, ei.Size, extentFilePath)
if err = os.Remove(extentFilePath); err != nil && !os.IsNotExist(err) {
// NOTE: if the remove failed for any reason other than the file not existing,
// treat it as a broken disk.
err = BrokenDiskError
return
}
if err = s.PersistenceHasDeleteExtent(extentID); err != nil {
err = BrokenDiskError
return
}
ei.IsDeleted = true
ei.ModifyTime = time.Now().Unix()
s.cache.Del(extentID)
if err = s.DeleteBlockCrc(extentID); err != nil {
err = BrokenDiskError
return
}
s.PutNormalExtentToDeleteCache(extentID)
s.eiMutex.Lock()
delete(s.extentInfoMap, extentID)
s.eiMutex.Unlock()
return
}
func (s *ExtentStore) PutNormalExtentToDeleteCache(extentID uint64) {
s.hasDeleteNormalExtentsCache.Store(extentID, time.Now().Unix())
}
func (s *ExtentStore) IsDeletedNormalExtent(extentID uint64) (ok bool) {
_, ok = s.hasDeleteNormalExtentsCache.Load(extentID)
return
}
// Close closes the extent store.
func (s *ExtentStore) Close() {
s.mutex.Lock()
defer s.mutex.Unlock()
if s.closed {
return
}
// Release cache
s.cache.Flush()
s.cache.Clear()
s.tinyExtentDeleteFp.Sync()
s.tinyExtentDeleteFp.Close()
s.normalExtentDeleteFp.Sync()
s.normalExtentDeleteFp.Close()
s.verifyExtentFp.Sync()
s.verifyExtentFp.Close()
for _, vFp := range s.verifyExtentFpAppend {
if vFp != nil {
vFp.Sync()
vFp.Close()
}
}
s.closed = true
}
// Watermark returns the extent info of the given extent on the record.
func (s *ExtentStore) Watermark(extentID uint64) (ei *ExtentInfo, err error) {
var has bool
s.eiMutex.RLock()
ei, has = s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if !has {
err = fmt.Errorf("e %v not exist", s.getExtentKey(extentID))
return
}
return
}
// GetTinyExtentOffset returns the current write offset (watermark) of the given tiny extent, rounded up to util.PageSize.
func (s *ExtentStore) GetTinyExtentOffset(extentID uint64) (watermark int64, err error) {
einfo, err := s.Watermark(extentID)
if err != nil {
return
}
watermark = int64(einfo.Size)
if watermark%util.PageSize != 0 {
watermark = watermark + (util.PageSize - watermark%util.PageSize)
}
return
}
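// Example (illustrative sketch): the returned offset is the extent size rounded up to
// the next util.PageSize boundary. With a hypothetical page size of 4096 bytes and a
// tiny extent of 10000 bytes, the watermark becomes 12288:
//
//	watermark := int64(10000)
//	if watermark%4096 != 0 {
//		watermark += 4096 - watermark%4096 // 10000 -> 12288
//	}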
// GetExtentSnapshotModOffset returns the snapshot pre-allocation write offset of the given extent and advances it by allocSize.
func (s *ExtentStore) GetExtentSnapshotModOffset(extentID uint64, allocSize uint32) (watermark int64, err error) {
einfo, err := s.Watermark(extentID)
if err != nil {
return
}
log.LogDebugf("action[ExtentStore.GetExtentSnapshotModOffset] extId %v SnapshotDataOff %v SnapPreAllocDataOff %v allocSize %v",
extentID, einfo.SnapshotDataOff, einfo.SnapPreAllocDataOff, allocSize)
if einfo.SnapPreAllocDataOff == 0 {
einfo.SnapPreAllocDataOff = einfo.SnapshotDataOff
}
watermark = int64(einfo.SnapPreAllocDataOff)
//if watermark%util.PageSize != 0 {
// watermark = watermark + (util.PageSize - watermark%util.PageSize)
//}
einfo.SnapPreAllocDataOff += uint64(allocSize)
return
}
// Sector size
const (
DiskSectorSize = 512
)
func (s *ExtentStore) GetStoreUsedSize() (used int64) {
extentInfoSlice := make([]*ExtentInfo, 0, s.GetExtentCount())
s.eiMutex.RLock()
for _, extentID := range s.extentInfoMap {
extentInfoSlice = append(extentInfoSlice, extentID)
}
s.eiMutex.RUnlock()
for _, einfo := range extentInfoSlice {
if einfo.IsDeleted {
continue
}
if IsTinyExtent(einfo.FileID) {
stat := new(syscall.Stat_t)
err := syscall.Stat(fmt.Sprintf("%v/%v", s.dataPath, einfo.FileID), stat)
if err != nil {
continue
}
used += stat.Blocks * DiskSectorSize
} else {
used += int64(einfo.Size + (einfo.SnapshotDataOff - util.ExtentSize))
}
}
return
}
// GetAllWatermarks returns all the watermarks.
func (s *ExtentStore) GetAllWatermarks(filter ExtentFilter) (extents []*ExtentInfo, tinyDeleteFileSize int64, err error) {
extents = make([]*ExtentInfo, 0, len(s.extentInfoMap))
extentInfoSlice := make([]*ExtentInfo, 0, len(s.extentInfoMap))
s.eiMutex.RLock()
for _, extentID := range s.extentInfoMap {
extentInfoSlice = append(extentInfoSlice, extentID)
}
s.eiMutex.RUnlock()
for _, extentInfo := range extentInfoSlice {
if filter != nil && !filter(extentInfo) {
continue
}
if extentInfo.IsDeleted {
continue
}
extents = append(extents, extentInfo)
}
tinyDeleteFileSize, err = s.LoadTinyDeleteFileOffset()
return
}
func (s *ExtentStore) getTinyExtentInfo() (extents []*ExtentInfo) {
extents = make([]*ExtentInfo, 0)
s.eiMutex.RLock()
var extentID uint64
for extentID = TinyExtentStartID; extentID < TinyExtentCount+TinyExtentStartID; extentID++ {
ei := s.extentInfoMap[extentID]
if ei == nil {
continue
}
extents = append(extents, ei)
}
s.eiMutex.RUnlock()
return
}
// ExtentID parses the extent ID from the given file name and reports whether the file is an extent file.
func (s *ExtentStore) ExtentID(filename string) (extentID uint64, isExtent bool) {
if isExtent = RegexpExtentFile.MatchString(filename); !isExtent {
return
}
var err error
if extentID, err = strconv.ParseUint(filename, 10, 64); err != nil {
isExtent = false
return
}
isExtent = true
return
}
func (s *ExtentStore) initTinyExtent() (err error) {
s.availableTinyExtentC = make(chan uint64, TinyExtentCount)
s.brokenTinyExtentC = make(chan uint64, TinyExtentCount)
var extentID uint64
for extentID = TinyExtentStartID; extentID < TinyExtentStartID+TinyExtentCount; extentID++ {
err = s.Create(extentID)
if err == nil || strings.Contains(err.Error(), syscall.EEXIST.Error()) || err == ExtentExistsError {
err = nil
s.brokenTinyExtentC <- extentID
s.brokenTinyExtentMap.Store(extentID, true)
continue
}
return err
}
return
}
// GetAvailableTinyExtent returns the available tiny extent from the channel.
func (s *ExtentStore) GetAvailableTinyExtent() (extentID uint64, err error) {
select {
case extentID = <-s.availableTinyExtentC:
log.LogDebugf("dp %v GetAvailableTinyExtent. extentID %v", s.partitionID, extentID)
s.availableTinyExtentMap.Delete(extentID)
return
default:
log.LogDebugf("dp %v GetAvailableTinyExtent not found", s.partitionID)
return 0, NoAvailableExtentError
}
}
// SendToAvailableTinyExtentC sends the extent to the channel that stores the available tiny extents.
func (s *ExtentStore) SendToAvailableTinyExtentC(extentID uint64) {
log.LogDebugf("dp %v action[SendToAvailableTinyExtentC] extentid %v", s.partitionID, extentID)
if _, ok := s.availableTinyExtentMap.Load(extentID); !ok {
log.LogDebugf("dp %v SendToAvailableTinyExtentC. extentID %v", s.partitionID, extentID)
s.availableTinyExtentC <- extentID
s.availableTinyExtentMap.Store(extentID, true)
} else {
log.LogDebugf("dp %v action[SendToAvailableTinyExtentC] extentid %v already exist", s.partitionID, extentID)
}
}
// SendAllToBrokenTinyExtentC sends all the extents to the channel that stores the broken extents.
func (s *ExtentStore) SendAllToBrokenTinyExtentC(extentIds []uint64) {
for _, extentID := range extentIds {
if _, ok := s.brokenTinyExtentMap.Load(extentID); !ok {
s.brokenTinyExtentC <- extentID
s.brokenTinyExtentMap.Store(extentID, true)
}
}
}
// AvailableTinyExtentCnt returns the count of the available tiny extents.
func (s *ExtentStore) AvailableTinyExtentCnt() int {
return len(s.availableTinyExtentC)
}
// BrokenTinyExtentCnt returns the count of the broken tiny extents.
func (s *ExtentStore) BrokenTinyExtentCnt() int {
return len(s.brokenTinyExtentC)
}
// MoveAllToBrokenTinyExtentC moves up to cnt available tiny extents to the channel that stores the broken extents.
func (s *ExtentStore) MoveAllToBrokenTinyExtentC(cnt int) {
for i := 0; i < cnt; i++ {
extentID, err := s.GetAvailableTinyExtent()
if err != nil {
return
}
s.SendToBrokenTinyExtentC(extentID)
}
}
// SendToBrokenTinyExtentC sends the given extent id to the channel.
func (s *ExtentStore) SendToBrokenTinyExtentC(extentID uint64) {
if _, ok := s.brokenTinyExtentMap.Load(extentID); !ok {
s.brokenTinyExtentC <- extentID
s.brokenTinyExtentMap.Store(extentID, true)
}
}
// GetBrokenTinyExtent returns the first broken extent in the channel.
func (s *ExtentStore) GetBrokenTinyExtent() (extentID uint64, err error) {
select {
case extentID = <-s.brokenTinyExtentC:
s.brokenTinyExtentMap.Delete(extentID)
return
default:
return 0, NoBrokenExtentError
}
}
// StoreSizeExtentID returns the total size of all extents whose IDs are not greater than maxExtentID.
func (s *ExtentStore) StoreSizeExtentID(maxExtentID uint64) (totalSize uint64) {
extentInfos := make([]*ExtentInfo, 0)
s.eiMutex.RLock()
for _, extentInfo := range s.extentInfoMap {
if extentInfo.FileID <= maxExtentID {
extentInfos = append(extentInfos, extentInfo)
}
}
s.eiMutex.RUnlock()
for _, extentInfo := range extentInfos {
totalSize += extentInfo.TotalSize()
log.LogDebugf("ExtentStore.StoreSizeExtentID dp %v extentInfo %v totalSize %v", s.partitionID, extentInfo, extentInfo.TotalSize())
}
return totalSize
}
// GetMaxExtentIDAndPartitionSize returns the maximum extent ID and the total size of all extents in the partition.
func (s *ExtentStore) GetMaxExtentIDAndPartitionSize() (maxExtentID, totalSize uint64) {
extentInfos := make([]*ExtentInfo, 0)
s.eiMutex.RLock()
for _, extentInfo := range s.extentInfoMap {
extentInfos = append(extentInfos, extentInfo)
}
s.eiMutex.RUnlock()
for _, extentInfo := range extentInfos {
if extentInfo.FileID > maxExtentID {
maxExtentID = extentInfo.FileID
}
totalSize += extentInfo.TotalSize()
}
return maxExtentID, totalSize
}
func MarshalTinyExtent(extentID uint64, offset, size int64) (data []byte) {
data = make([]byte, DeleteTinyRecordSize)
binary.BigEndian.PutUint64(data[0:8], extentID)
binary.BigEndian.PutUint64(data[8:16], uint64(offset))
binary.BigEndian.PutUint64(data[16:DeleteTinyRecordSize], uint64(size))
return data
}
func UnMarshalTinyExtent(data []byte) (extentID, offset, size uint64) {
extentID = binary.BigEndian.Uint64(data[0:8])
offset = binary.BigEndian.Uint64(data[8:16])
size = binary.BigEndian.Uint64(data[16:DeleteTinyRecordSize])
return
}
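// Example (illustrative sketch): a tiny-delete record is a fixed-size big-endian triple
// of (extentID, offset, size); DeleteTinyRecordSize is defined elsewhere in this package
// and is assumed here to be 24 bytes. The IDs and sizes below are arbitrary:
//
//	record := MarshalTinyExtent(1025, 0, 4096)   // len(record) == DeleteTinyRecordSize
//	id, off, size := UnMarshalTinyExtent(record) // id == 1025, off == 0, size == 4096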
func (s *ExtentStore) RecordTinyDelete(extentID uint64, offset, size int64) (err error) {
record := MarshalTinyExtent(extentID, offset, size)
stat, err := s.tinyExtentDeleteFp.Stat()
if err != nil {
return
}
if stat.Size()%DeleteTinyRecordSize != 0 {
needWriteEmpty := DeleteTinyRecordSize - (stat.Size() % DeleteTinyRecordSize)
data := make([]byte, needWriteEmpty)
s.tinyExtentDeleteFp.Write(data)
}
_, err = s.tinyExtentDeleteFp.Write(record)
if err != nil {
return
}
return
}
func (s *ExtentStore) ReadTinyDeleteRecords(offset, size int64, data []byte) (crc uint32, err error) {
_, err = s.tinyExtentDeleteFp.ReadAt(data[:size], offset)
if err == nil || err == io.EOF {
err = nil
crc = crc32.ChecksumIEEE(data[:size])
}
return
}
type ExtentDeleted struct {
ExtentID uint64 `json:"extentID"`
Offset uint64 `json:"offset"`
Size uint64 `json:"size"`
}
func (s *ExtentStore) GetHasDeleteTinyRecords() (extentDes []ExtentDeleted, err error) {
data := make([]byte, DeleteTinyRecordSize)
offset := int64(0)
for {
_, err = s.tinyExtentDeleteFp.ReadAt(data, offset)
if err != nil {
if err == io.EOF {
err = nil
}
return
}
extent := ExtentDeleted{}
extent.ExtentID, extent.Offset, extent.Size = UnMarshalTinyExtent(data)
extentDes = append(extentDes, extent)
offset += DeleteTinyRecordSize
}
}
// NextExtentID returns the next extentID. When the client sends the request to create an extent,
// this function generates a unique extentID within the current partition.
// This function can only be called by the leader.
func (s *ExtentStore) NextExtentID() (extentID uint64, err error) {
extentID = atomic.AddUint64(&s.baseExtentID, 1)
err = s.PersistenceBaseExtentID(extentID)
return
}
func (s *ExtentStore) LoadTinyDeleteFileOffset() (offset int64, err error) {
stat, err := s.tinyExtentDeleteFp.Stat()
if err == nil {
offset = stat.Size()
}
return
}
func (s *ExtentStore) getExtentKey(extent uint64) string {
return fmt.Sprintf("extent %v_%v", s.partitionID, extent)
}
// UpdateBaseExtentID updates the base extent ID.
func (s *ExtentStore) UpdateBaseExtentID(id uint64) (err error) {
if IsTinyExtent(id) {
return
}
if id > atomic.LoadUint64(&s.baseExtentID) {
atomic.StoreUint64(&s.baseExtentID, id)
err = s.PersistenceBaseExtentID(atomic.LoadUint64(&s.baseExtentID))
}
s.PreAllocSpaceOnVerfiyFile(atomic.LoadUint64(&s.baseExtentID))
return
}
func (s *ExtentStore) extent(extentID uint64) (e *Extent, err error) {
if e, err = s.LoadExtentFromDisk(extentID, false); err != nil {
err = fmt.Errorf("load extent from disk: %v", err)
return nil, err
}
return
}
func (s *ExtentStore) extentWithHeader(ei *ExtentInfo) (e *Extent, err error) {
var ok bool
if ei == nil || ei.IsDeleted {
err = ExtentNotFoundError
return
}
if e, ok = s.cache.Get(ei.FileID); !ok {
if e, err = s.LoadExtentFromDisk(ei.FileID, true); err != nil {
err = fmt.Errorf("load %v from disk: %v", s.getExtentKey(ei.FileID), err)
return nil, err
}
}
return
}
func (s *ExtentStore) extentWithHeaderByExtentID(extentID uint64) (e *Extent, err error) {
var ok bool
if e, ok = s.cache.Get(extentID); !ok {
if e, err = s.LoadExtentFromDisk(extentID, true); err != nil {
err = fmt.Errorf("load %v from disk: %v", s.getExtentKey(extentID), err)
return nil, err
}
}
return
}
// HasExtent tells if the extent store has the extent with the given ID
func (s *ExtentStore) HasExtent(extentID uint64) (exist bool) {
s.eiMutex.RLock()
defer s.eiMutex.RUnlock()
_, exist = s.extentInfoMap[extentID]
return
}
// GetExtentCount returns the number of extents in the extentInfoMap
func (s *ExtentStore) GetExtentCount() (count int) {
s.eiMutex.RLock()
defer s.eiMutex.RUnlock()
return len(s.extentInfoMap)
}
func (s *ExtentStore) LoadExtentFromDisk(extentID uint64, putCache bool) (e *Extent, err error) {
name := path.Join(s.dataPath, fmt.Sprintf("%v", extentID))
e = NewExtentInCore(name, extentID)
if err = e.RestoreFromFS(); err != nil {
err = fmt.Errorf("restore from file %v putCache %v system: %v", name, putCache, err)
return
}
if !putCache {
return
}
if !IsTinyExtent(extentID) && proto.IsNormalDp(s.partitionType) {
e.header = make([]byte, util.BlockHeaderSize)
if _, err = s.verifyExtentFp.ReadAt(e.header, int64(extentID*util.BlockHeaderSize)); err != nil && err != io.EOF {
return
}
emptyHeader := make([]byte, util.BlockHeaderSize)
log.LogDebugf("LoadExtentFromDisk. partition id %v extentId %v, snapshotOff %v, append fp cnt %v",
s.partitionID, extentID, e.snapshotDataOff, len(s.verifyExtentFpAppend))
if e.snapshotDataOff > util.ExtentSize {
for id, vFp := range s.verifyExtentFpAppend {
if uint64(id) > (e.snapshotDataOff-util.ExtentSize)/util.ExtentSize {
log.LogDebugf("LoadExtentFromDisk. partition id %v extentId %v, snapshotOff %v id %v out of extent range",
s.partitionID, extentID, e.snapshotDataOff, id)
break
}
log.LogDebugf("LoadExtentFromDisk. partition id %v extentId %v, snapshotOff %v id %v", s.partitionID, extentID, e.snapshotDataOff, id)
header := make([]byte, util.BlockHeaderSize)
if _, err = vFp.ReadAt(header, int64(extentID*util.BlockHeaderSize)); err != nil && err != io.EOF {
log.LogDebugf("LoadExtentFromDisk. partition id %v extentId %v, read at %v err %v",
s.partitionID, extentID, extentID*util.BlockHeaderSize, err)
return
}
if bytes.Equal(emptyHeader, header) {
log.LogErrorf("LoadExtentFromDisk. partition id %v extent %v hole at id %v", s.partitionID, e, id)
}
e.header = append(e.header, header...)
}
if len(s.verifyExtentFpAppend) < int(e.snapshotDataOff-1)/util.ExtentSize {
log.LogErrorf("LoadExtentFromDisk. extent %v need fp %v out of range %v", e, int(e.snapshotDataOff-1)/util.ExtentSize, len(s.verifyExtentFpAppend))
}
}
}
err = nil
s.cache.Put(e)
return
}
func (s *ExtentStore) ScanBlocks(extentID uint64) (bcs []*BlockCrc, err error) {
if !proto.IsNormalDp(s.partitionType) {
return
}
var blockCnt int
bcs = make([]*BlockCrc, 0)
ei := s.extentInfoMap[extentID]
e, err := s.extentWithHeader(ei)
if err != nil {
return bcs, err
}
extSize := e.Size()
if e.snapshotDataOff > util.ExtentSize {
extSize = int64(e.snapshotDataOff)
}
blockCnt = int(extSize / util.BlockSize)
if e.Size()%util.BlockSize != 0 {
blockCnt += 1
}
for blockNo := 0; blockNo < blockCnt; blockNo++ {
blockCrc := binary.BigEndian.Uint32(e.header[blockNo*util.PerBlockCrcSize : (blockNo+1)*util.PerBlockCrcSize])
bcs = append(bcs, &BlockCrc{BlockNo: blockNo, Crc: blockCrc})
}
sort.Sort(BlockCrcArr(bcs))
return
}
type ExtentInfoArr []*ExtentInfo
func (arr ExtentInfoArr) Len() int { return len(arr) }
func (arr ExtentInfoArr) Less(i, j int) bool { return arr[i].FileID < arr[j].FileID }
func (arr ExtentInfoArr) Swap(i, j int) { arr[i], arr[j] = arr[j], arr[i] }
func (s *ExtentStore) BackendTask() {
s.autoComputeExtentCrc()
s.cleanExpiredNormalExtentDeleteCache()
}
func (s *ExtentStore) cleanExpiredNormalExtentDeleteCache() {
s.hasDeleteNormalExtentsCache.Range(func(key, value interface{}) bool {
deleteTime := value.(int64)
extentID := key.(uint64)
if time.Now().Unix()-deleteTime > NormalExtentDeleteRetainTime {
s.hasDeleteNormalExtentsCache.Delete(extentID)
}
return true
})
}
func (s *ExtentStore) autoComputeExtentCrc() {
if !proto.IsNormalDp(s.partitionType) {
return
}
defer func() {
if r := recover(); r != nil {
return
}
}()
extentInfos := make([]*ExtentInfo, 0)
deleteExtents := make([]*ExtentInfo, 0)
s.eiMutex.RLock()
for _, ei := range s.extentInfoMap {
extentInfos = append(extentInfos, ei)
if ei.IsDeleted && time.Now().Unix()-ei.ModifyTime > UpdateCrcInterval {
deleteExtents = append(deleteExtents, ei)
}
}
s.eiMutex.RUnlock()
if len(deleteExtents) > 0 {
s.eiMutex.Lock()
for _, ei := range deleteExtents {
delete(s.extentInfoMap, ei.FileID)
}
s.eiMutex.Unlock()
}
sort.Sort(ExtentInfoArr(extentInfos))
for _, ei := range extentInfos {
s.ApplyIdMutex.RLock()
if ei == nil {
s.ApplyIdMutex.RUnlock()
continue
}
if !IsTinyExtent(ei.FileID) && time.Now().Unix()-ei.ModifyTime > UpdateCrcInterval &&
!ei.IsDeleted && ei.Size > 0 && ei.Crc == 0 {
e, err := s.extentWithHeader(ei)
if err != nil {
log.LogError("[autoComputeExtentCrc] get extent error", err)
s.ApplyIdMutex.RUnlock()
continue
}
extentCrc, err := e.autoComputeExtentCrc(s.PersistenceBlockCrc)
if err != nil {
log.LogError("[autoComputeExtentCrc] compute crc fail", err)
s.ApplyIdMutex.RUnlock()
continue
}
ei.UpdateExtentInfo(e, extentCrc)
ei.ApplyID = s.ApplyId
time.Sleep(time.Millisecond * 100)
}
s.ApplyIdMutex.RUnlock()
}
time.Sleep(time.Second)
}
func (s *ExtentStore) TinyExtentRecover(extentID uint64, offset, size int64, data []byte, crc uint32, isEmptyPacket bool) (err error) {
if !IsTinyExtent(extentID) {
return fmt.Errorf("extent %v not tinyExtent", extentID)
}
var (
e *Extent
ei *ExtentInfo
)
s.eiMutex.RLock()
ei = s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if e, err = s.extentWithHeader(ei); err != nil {
return nil
}
if err = e.TinyExtentRecover(data, offset, size, crc, isEmptyPacket); err != nil {
return err
}
ei.UpdateExtentInfo(e, 0)
return nil
}
func (s *ExtentStore) TinyExtentGetFinfoSize(extentID uint64) (size uint64, err error) {
var e *Extent
if !IsTinyExtent(extentID) {
return 0, fmt.Errorf("unavali extent id (%v)", extentID)
}
s.eiMutex.RLock()
ei := s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if e, err = s.extentWithHeader(ei); err != nil {
return
}
finfo, err := e.file.Stat()
if err != nil {
return 0, err
}
size = uint64(finfo.Size())
return
}
func (s *ExtentStore) TinyExtentAvaliOffset(extentID uint64, offset int64) (newOffset, newEnd int64, err error) {
var e *Extent
if !IsTinyExtent(extentID) {
return 0, 0, fmt.Errorf("unavali extent(%v)", extentID)
}
s.eiMutex.RLock()
ei := s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if e, err = s.extentWithHeader(ei); err != nil {
return
}
defer func() {
if err != nil && strings.Contains(err.Error(), syscall.ENXIO.Error()) {
newOffset = e.dataSize
newEnd = e.dataSize
err = nil
}
}()
newOffset, newEnd, err = e.tinyExtentAvaliOffset(offset)
return
}
func (s *ExtentStore) renameStaleExtentStore() (err error) {
// Rename the current data folder to a timestamped backup so a fresh one can be created.
if _, err = os.Stat(s.dataPath); err != nil {
if os.IsNotExist(err) {
return nil
}
}
curTime := time.Now().Format(StaleExtStoreTimeFormat)
staleExtStoreDirName := s.dataPath + "_" + curTime + StaleExtStoreBackupSuffix
if err = os.Rename(s.dataPath, staleExtStoreDirName); err != nil {
return
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"syscall"
"github.com/cubefs/cubefs/util"
)
func fallocate(fd int, mode uint32, off int64, len int64) (err error) {
var tryCnt int
for {
err = syscall.Fallocate(fd, mode, off, len)
if err == syscall.EINTR {
tryCnt++
if tryCnt >= util.SyscallTryMaxTimes {
return
}
continue
}
return
}
}
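// Example (illustrative sketch): the wrapper only retries Fallocate when it is
// interrupted by a signal (EINTR). Punching a hole in an already-open file, as
// DeleteBlockCrc in this package does, would look like (fp is an *os.File opened
// elsewhere; offset and length are illustrative):
//
//	err := fallocate(int(fp.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize, 0, 4096)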
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"encoding/binary"
"io"
"os"
"path"
"strconv"
"sync/atomic"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type BlockCrc struct {
BlockNo int
Crc uint32
}
type BlockCrcArr []*BlockCrc
const (
BaseExtentIDOffset = 0
)
func (arr BlockCrcArr) Len() int { return len(arr) }
func (arr BlockCrcArr) Less(i, j int) bool { return arr[i].BlockNo < arr[j].BlockNo }
func (arr BlockCrcArr) Swap(i, j int) { arr[i], arr[j] = arr[j], arr[i] }
type (
UpdateCrcFunc func(e *Extent, blockNo int, crc uint32) (err error)
GetExtentCrcFunc func(extentID uint64) (crc uint32, err error)
)
func (s *ExtentStore) BuildSnapshotExtentCrcMetaFile(blockNo int) (fp *os.File, err error) {
fIdx := blockNo * util.PerBlockCrcSize / util.BlockHeaderSize
if fIdx > 0 {
gap := fIdx - len(s.verifyExtentFpAppend)
log.LogDebugf("PersistenceBlockCrc. idx %v gap %v", fIdx, gap)
if gap > 0 {
appendFpArr := make([]*os.File, fIdx-len(s.verifyExtentFpAppend))
s.verifyExtentFpAppend = append(s.verifyExtentFpAppend, appendFpArr...)
for i := gap; i > 0; i-- {
suffix := fIdx - i
dataPath := path.Join(s.dataPath, ExtCrcHeaderFileName+"_"+strconv.Itoa(suffix))
log.LogDebugf("PersistenceBlockCrc. idx %v try create path %v", fIdx-1, dataPath)
if fp, err = os.OpenFile(dataPath, os.O_CREATE|os.O_RDWR, 0o666); err != nil {
log.LogDebugf("PersistenceBlockCrc. idx %v try create path %v err %v", fIdx, dataPath, err)
return
}
log.LogDebugf("PersistenceBlockCrc. idx %v try create path %v success", fIdx, dataPath)
s.verifyExtentFpAppend[suffix] = fp
s.PreAllocSpaceOnVerfiyFileForAppend(suffix)
}
}
if s.verifyExtentFpAppend[fIdx-1] == nil {
dataPath := path.Join(s.dataPath, ExtCrcHeaderFileName+"_"+strconv.Itoa(fIdx-1))
if fp, err = os.OpenFile(dataPath, os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
s.verifyExtentFpAppend[fIdx-1] = fp
}
fp = s.verifyExtentFpAppend[fIdx-1]
}
return
}
func (s *ExtentStore) PersistenceBlockCrc(e *Extent, blockNo int, blockCrc uint32) (err error) {
log.LogDebugf("PersistenceBlockCrc. extent id %v blockNo %v blockCrc %v data path %v", e.extentID, blockNo, blockCrc, s.dataPath)
if !proto.IsNormalDp(s.partitionType) {
return
}
if blockNo >= len(e.header)/util.PerBlockCrcSize {
exp := make([]byte, util.BlockHeaderSize*(1+(blockNo*util.PerBlockCrcSize-len(e.header))/util.BlockHeaderSize))
e.header = append(e.header, exp...)
}
fIdx := blockNo * util.PerBlockCrcSize / util.BlockHeaderSize
log.LogDebugf("PersistenceBlockCrc. idx %v", fIdx)
fp := s.verifyExtentFp
if fIdx > 0 {
if fp, err = s.BuildSnapshotExtentCrcMetaFile(blockNo); err != nil {
return
}
}
startIdx := blockNo * util.PerBlockCrcSize % util.BlockHeaderSize
verifyStart := startIdx + int(util.BlockHeaderSize*e.extentID)
log.LogDebugf("PersistenceBlockCrc. dp %v write at start %v name %v", s.partitionID, startIdx, fp.Name())
headerOff := blockNo*util.PerBlockCrcSize%util.BlockHeaderSize + fIdx*util.BlockHeaderSize
headerEnd := startIdx + util.PerBlockCrcSize%util.BlockHeaderSize + fIdx*util.BlockHeaderSize
binary.BigEndian.PutUint32(e.header[headerOff:headerEnd], blockCrc)
if _, err = fp.WriteAt(e.header[headerOff:headerEnd], int64(verifyStart)); err != nil {
return
}
return
}
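// Worked example (illustrative, using hypothetical sizes PerBlockCrcSize=4 and
// BlockHeaderSize=4096, i.e. 1024 block CRCs per header segment): for blockNo=1030,
//
//	fIdx      = 1030*4/4096          = 1    // CRC lives in the first append verify file
//	startIdx  = 1030*4%4096          = 24   // byte offset within that segment
//	headerOff = startIdx + fIdx*4096 = 4120 // byte offset within e.header
//
// and the 4-byte CRC is written to the selected verify file at offset
// startIdx + util.BlockHeaderSize*e.extentID.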
func (s *ExtentStore) DeleteBlockCrc(extentID uint64) (err error) {
if !proto.IsNormalDp(s.partitionType) {
return
}
if err = fallocate(int(s.verifyExtentFp.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize,
int64(util.BlockHeaderSize*extentID), util.BlockHeaderSize); err != nil {
return
}
for idx, fp := range s.verifyExtentFpAppend {
if fp == nil {
log.LogErrorf("DeleteBlockCrc. idx %v append fp is nil", idx)
return
}
log.LogDebugf("DeleteBlockCrc. dp %v idx %v extentID %v offset %v", s.partitionID, idx, extentID, int64(util.BlockHeaderSize*extentID))
if err = fallocate(int(fp.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize,
int64(util.BlockHeaderSize*extentID), util.BlockHeaderSize); err != nil {
return
}
}
return
}
func (s *ExtentStore) PersistenceBaseExtentID(extentID uint64) (err error) {
value := make([]byte, 8)
binary.BigEndian.PutUint64(value, extentID)
_, err = s.metadataFp.WriteAt(value, BaseExtentIDOffset)
return
}
func (s *ExtentStore) GetPreAllocSpaceExtentIDOnVerifyFile() (extentID uint64) {
value := make([]byte, 8)
_, err := s.metadataFp.ReadAt(value, 8)
if err != nil {
return
}
extentID = binary.BigEndian.Uint64(value)
return
}
func (s *ExtentStore) PreAllocSpaceOnVerfiyFileForAppend(idx int) {
if !proto.IsNormalDp(s.partitionType) {
return
}
log.LogDebugf("PreAllocSpaceOnVerfiyFileForAppend. idx %v end %v", idx, len(s.verifyExtentFpAppend))
if idx >= len(s.verifyExtentFpAppend) {
log.LogErrorf("PreAllocSpaceOnVerfiyFileForAppend. idx %v end %v", idx, len(s.verifyExtentFpAppend))
return
}
prevAllocSpaceExtentID := int64(atomic.LoadUint64(&s.hasAllocSpaceExtentIDOnVerfiyFile))
log.LogDebugf("PreAllocSpaceOnVerfiyFileForAppend. idx %v size %v", idx, prevAllocSpaceExtentID*util.BlockHeaderSize)
err := fallocate(int(s.verifyExtentFpAppend[idx].Fd()), 1, 0, prevAllocSpaceExtentID*util.BlockHeaderSize)
if err != nil {
log.LogErrorf("PreAllocSpaceOnVerfiyFileForAppend. idx %v size %v err %v", idx, prevAllocSpaceExtentID*util.BlockHeaderSize, err)
return
}
}
func (s *ExtentStore) PreAllocSpaceOnVerfiyFile(currExtentID uint64) {
if !proto.IsNormalDp(s.partitionType) {
return
}
if currExtentID > atomic.LoadUint64(&s.hasAllocSpaceExtentIDOnVerfiyFile) {
prevAllocSpaceExtentID := int64(atomic.LoadUint64(&s.hasAllocSpaceExtentIDOnVerfiyFile))
endAllocSpaceExtentID := int64(prevAllocSpaceExtentID + 1000)
size := int64(1000 * util.BlockHeaderSize)
err := fallocate(int(s.verifyExtentFp.Fd()), 1, prevAllocSpaceExtentID*util.BlockHeaderSize, size)
if err != nil {
return
}
for id, fp := range s.verifyExtentFpAppend {
stat, _ := fp.Stat()
log.LogDebugf("PreAllocSpaceOnVerfiyFile. id %v name %v size %v", id, fp.Name(), stat.Size())
err = fallocate(int(fp.Fd()), 1, prevAllocSpaceExtentID*util.BlockHeaderSize, size)
if err != nil {
log.LogErrorf("PreAllocSpaceOnVerfiyFile. id %v name %v err %v", id, fp.Name(), err)
return
}
}
data := make([]byte, 8)
binary.BigEndian.PutUint64(data, uint64(endAllocSpaceExtentID))
if _, err = s.metadataFp.WriteAt(data, 8); err != nil {
return
}
atomic.StoreUint64(&s.hasAllocSpaceExtentIDOnVerfiyFile, uint64(endAllocSpaceExtentID))
log.LogInfof("Action(PreAllocSpaceOnVerifyFile) PartitionID(%v) currentExtent(%v)"+
"PrevAllocSpaceExtentIDOnVerifyFile(%v) EndAllocSpaceExtentIDOnVerifyFile(%v)"+
" has allocSpaceOnVerifyFile to (%v)", s.partitionID, currExtentID, prevAllocSpaceExtentID, endAllocSpaceExtentID,
prevAllocSpaceExtentID*util.BlockHeaderSize+size)
}
}
func (s *ExtentStore) GetPersistenceBaseExtentID() (extentID uint64, err error) {
data := make([]byte, 8)
_, err = s.metadataFp.ReadAt(data, 0)
if err != nil {
return
}
extentID = binary.BigEndian.Uint64(data)
return
}
func (s *ExtentStore) PersistenceHasDeleteExtent(extentID uint64) (err error) {
data := make([]byte, 8)
binary.BigEndian.PutUint64(data, extentID)
if _, err = s.normalExtentDeleteFp.Write(data); err != nil {
return
}
return
}
func (s *ExtentStore) GetHasDeleteExtent() (extentDes []ExtentDeleted, err error) {
data := make([]byte, 8)
offset := int64(0)
for {
_, err = s.normalExtentDeleteFp.ReadAt(data, offset)
if err != nil {
if err == io.EOF {
err = nil
}
return
}
extent := ExtentDeleted{}
extent.ExtentID = binary.BigEndian.Uint64(data)
extentDes = append(extentDes, extent)
offset += 8
}
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package atomicutil
import (
"math"
"sync/atomic"
)
type Float64 struct {
val uint64
}
func (f *Float64) Load() float64 {
return math.Float64frombits(atomic.LoadUint64(&f.val))
}
func (f *Float64) Store(val float64) {
atomic.StoreUint64(&f.val, math.Float64bits(val))
}
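// Example (illustrative sketch): Float64 keeps the IEEE-754 bit pattern in a uint64 so
// that concurrent loads and stores need no mutex:
//
//	var ratio Float64
//	ratio.Store(0.75)
//	v := ratio.Load() // v == 0.75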
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auditlog
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net"
"net/http"
"os"
"path"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
Audit_Module = "audit"
FileNameDateFormat = "20060102150405"
ShiftedExtension = ".old"
DefaultAuditLogBufSize = 0
F_OK = 0
DefaultCleanInterval = 1 * time.Hour
DefaultAuditLogSize = 200 * 1024 * 1024 // 200M
DefaultHeadRoom = 50 * 1024 // 50GB of head room; the value is in MB (see removeLogFile)
MaxReservedDays = 7 * 24 * time.Hour
)
const (
EnableAuditLogReqPath = "/auditlog/enable"
DisableAuditLogReqPath = "/auditlog/disable"
SetAuditLogBufSizeReqPath = "/auditlog/setbufsize"
)
const auditFullPathUnsupported = "(Audit full path unsupported)"
var DefaultTimeOutUs = [3]uint32{100000, 500000, 1000000}
type ShiftedFile []os.FileInfo
func (f ShiftedFile) Less(i, j int) bool {
return f[i].ModTime().Before(f[j].ModTime())
}
func (f ShiftedFile) Len() int {
return len(f)
}
func (f ShiftedFile) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
//type typeInfo struct {
// typeName string
// allCount uint32
// failCount uint32
// maxTime time.Duration
// minTime time.Duration
// allTimeUs time.Duration
// timeOut [MaxTimeoutLevel]uint32
//}
type AuditPrefix struct {
prefixes []string
}
func NewAuditPrefix(p ...string) *AuditPrefix {
return &AuditPrefix{
prefixes: p,
}
}
func (a *AuditPrefix) String() string {
builder := strings.Builder{}
for _, p := range a.prefixes {
builder.WriteString(p)
builder.WriteString(", ")
}
return builder.String()
}
type Audit struct {
hostName string
ipAddr string
logDir string
logModule string
logMaxSize int64
logFileName string
logFile *os.File
writer *bufio.Writer
writerBufSize int
prefix *AuditPrefix
bufferC chan string
stopC chan struct{}
resetWriterBuffC chan int
pid int
lock sync.Mutex
}
var (
gAdt *Audit = nil
gAdtMutex sync.RWMutex
)
func getAddr() (HostName, IPAddr string) {
hostName, err := os.Hostname()
if err != nil {
HostName = "Unknown"
log.LogWarnf("Get host name failed, replaced by unknown. err(%v)", err)
} else {
HostName = hostName
}
addrs, err := net.InterfaceAddrs()
if err != nil {
IPAddr = "Unknown"
log.LogWarnf("Get ip address failed, replaced by unknown. err(%v)", err)
} else {
var ip_addrs []string
for _, addr := range addrs {
if ipnet, ok := addr.(*net.IPNet); ok && !ipnet.IP.IsLoopback() && ipnet.IP.To4() != nil {
ip_addrs = append(ip_addrs, ipnet.IP.String())
}
}
if len(ip_addrs) > 0 {
IPAddr = strings.Join(ip_addrs, ",")
} else {
IPAddr = "Unknown"
log.LogWarnf("Get ip address failed, replaced by unknown. err(%v)", err)
}
}
return
}
// NOTE: for client http apis
func ResetWriterBuffSize(w http.ResponseWriter, r *http.Request) {
var err error
if err = r.ParseForm(); err != nil {
BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
size := int(DefaultAuditLogBufSize)
if sizeStr := r.FormValue("size"); sizeStr != "" {
val, err := strconv.Atoi(sizeStr)
if err != nil {
err = fmt.Errorf("size error")
BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
size = val
}
ResetWriterBufferSize(size)
BuildSuccessResp(w, "set audit log buffer size success")
}
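// Example (illustrative sketch): assuming this handler is registered at
// SetAuditLogBufSizeReqPath ("/auditlog/setbufsize") on a node's HTTP server,
// the buffer size can be changed with a plain GET; the host and port below are
// hypothetical:
//
//	resp, err := http.Get("http://127.0.0.1:17010/auditlog/setbufsize?size=65536")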
func DisableAuditLog(w http.ResponseWriter, r *http.Request) {
StopAudit()
BuildSuccessResp(w, "disable audit log success")
}
func BuildSuccessResp(w http.ResponseWriter, data interface{}) {
buildJSONResp(w, http.StatusOK, data, "")
}
func BuildFailureResp(w http.ResponseWriter, code int, msg string) {
buildJSONResp(w, code, nil, msg)
}
// Create response for the API request.
func buildJSONResp(w http.ResponseWriter, code int, data interface{}, msg string) {
var (
jsonBody []byte
err error
)
// Set headers before WriteHeader; header changes made after WriteHeader are ignored by net/http.
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
body := struct {
Code int `json:"code"`
Data interface{} `json:"data"`
Msg string `json:"msg"`
}{
Code: code,
Data: data,
Msg: msg,
}
if jsonBody, err = json.Marshal(body); err != nil {
return
}
w.Write(jsonBody)
}
func (a *Audit) GetInfo() (dir, logModule string, logMaxSize int64) {
return a.logDir, a.logModule, a.logMaxSize
}
func NewAuditWithPrefix(dir, logModule string, logMaxSize int64, prefix *AuditPrefix) (a *Audit, err error) {
a, err = NewAudit(dir, logModule, logMaxSize)
if err != nil {
return nil, err
}
a.prefix = prefix
return a, nil
}
func NewAudit(dir, logModule string, logMaxSize int64) (*Audit, error) {
absPath, err := filepath.Abs(dir)
if err != nil {
return nil, err
}
host, ip := getAddr()
absPath = path.Join(absPath, logModule)
if !isPathSafe(absPath) {
return nil, errors.New("invalid file path")
}
fi, err := os.Stat(absPath)
if err != nil {
os.MkdirAll(absPath, 0o755)
} else {
if !fi.IsDir() {
return nil, errors.New(absPath + " is not a directory")
}
}
_ = os.Chmod(absPath, 0o755)
logName := path.Join(absPath, Audit_Module) + ".log"
audit := &Audit{
hostName: host,
ipAddr: ip,
logDir: absPath,
logModule: logModule,
logMaxSize: logMaxSize,
logFileName: logName,
writerBufSize: DefaultAuditLogBufSize,
bufferC: make(chan string, 1000),
prefix: nil,
stopC: make(chan struct{}),
resetWriterBuffC: make(chan int),
pid: os.Getpid(),
}
err = audit.newWriterSize(audit.writerBufSize)
if err != nil {
return nil, err
}
go audit.flushAuditLog()
return audit, nil
}
// NOTE:
// common header:
// [PREFIX] CURRENT_TIME TIME_ZONE
// format for client:
// [COMMON HEADER] IP_ADDR HOSTNAME OP SRC DST(Rename) ERR LATENCY SRC_INODE DST_INODE(Rename)
// format for server(inode):
// [COMMON HEADER] CLIENT_ADDR VOLUME OP ("nil") FULL_PATH ERR LATENCY INODE FILE_SIZE(Trunc)
// format for server(dentry):
// [COMMON HEADER] CLIENT_ADDR VOLUME OP NAME FULL_PATH ERR LATENCY INODE PARENT_INODE
// format for server(transaction):
// [COMMON HEADER] CLIENT_ADDR VOLUME OP TX_ID ("nil") ERR LATENCY TM_ID (0)
func (a *Audit) formatAuditEntry(ipAddr, hostName, op, src, dst string, err error, latency int64, srcInode, dstInode uint64) (entry string) {
var errStr string
if err != nil {
errStr = err.Error()
} else {
errStr = "nil"
}
curTime := time.Now()
curTimeStr := curTime.Format("2006-01-02 15:04:05")
timeZone, _ := curTime.Zone()
latencyStr := strconv.FormatInt(latency, 10) + " us"
srcInodeStr := strconv.FormatUint(srcInode, 10)
dstInodeStr := strconv.FormatUint(dstInode, 10)
entry = fmt.Sprintf("%s %s, %s, %s, %s, %s, %s, %s, %s, %s, %s",
curTimeStr, timeZone, ipAddr, hostName, op, src, dst, errStr, latencyStr, srcInodeStr, dstInodeStr)
return
}
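// Example (illustrative sketch): with hypothetical values plugged into the format
// string above, a client-side entry would look like:
//
//	2006-01-02 15:04:05 CST, 192.168.0.1, host-1, Rename, /a/b, /a/c, nil, 120 us, 1001, 1002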
func (a *Audit) LogClientOp(op, src, dst string, err error, latency int64, srcInode, dstInode uint64) {
a.formatLog(a.ipAddr, a.hostName, op, src, dst, err, latency, srcInode, dstInode)
}
func (a *Audit) LogDentryOp(clientAddr, volume, op, name, fullPath string, err error, latency int64, ino, parentIno uint64) {
if fullPath == "" {
fullPath = auditFullPathUnsupported
}
a.formatLog(clientAddr, volume, op, name, fullPath, err, latency, ino, parentIno)
}
func (a *Audit) LogInodeOp(clientAddr, volume, op, fullPath string, err error, latency int64, ino uint64, fileSize uint64) {
if fullPath == "" {
fullPath = auditFullPathUnsupported
}
a.formatLog(clientAddr, volume, op, "nil", fullPath, err, latency, ino, fileSize)
}
func (a *Audit) LogTxOp(clientAddr, volume, op, txId string, err error, latency int64) {
a.formatLog(clientAddr, volume, op, txId, "nil", err, latency, 0, 0)
}
func (a *Audit) formatLog(ipAddr, hostName, op, src, dst string, err error, latency int64, srcInode, dstInode uint64) {
if entry := a.formatAuditEntry(ipAddr, hostName, op, src, dst, err, latency, srcInode, dstInode); entry != "" {
if a.prefix != nil {
entry = fmt.Sprintf("%s%s", a.prefix.String(), entry)
}
a.AddLog(entry)
}
}
func (a *Audit) ResetWriterBufferSize(size int) {
a.lock.Lock()
defer a.lock.Unlock()
a.resetWriterBuffC <- size
}
func (a *Audit) AddLog(content string) {
a.lock.Lock()
defer a.lock.Unlock()
select {
case a.bufferC <- content:
return
default:
log.LogErrorf("async audit log failed, audit:[%s]", content)
}
}
// NOTE: global functions
func GetAuditLogInfo() (dir, logModule string, logMaxSize int64, err error) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt != nil {
dir, logModule, logMaxSize = gAdt.GetInfo()
return
} else {
return "", "", 0, errors.New("audit log is not initialized yet")
}
}
func InitAuditWithPrefix(dir, logModule string, logMaxSize int64, prefix *AuditPrefix) (a *Audit, err error) {
a, err = InitAudit(dir, logModule, logMaxSize)
if err != nil {
return nil, err
}
a.prefix = prefix
return a, nil
}
func InitAudit(dir, logModule string, logMaxSize int64) (*Audit, error) {
gAdtMutex.Lock()
defer gAdtMutex.Unlock()
if gAdt == nil {
adt, err := NewAudit(dir, logModule, logMaxSize)
if err != nil {
return nil, err
}
gAdt = adt
}
return gAdt, nil
}
func LogClientOp(op, src, dst string, err error, latency int64, srcInode, dstInode uint64) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt == nil {
return
}
gAdt.LogClientOp(op, src, dst, err, latency, srcInode, dstInode)
}
func LogDentryOp(clientAddr, volume, op, name, fullPath string, err error, latency int64, ino, parentIno uint64) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt == nil {
return
}
gAdt.LogDentryOp(clientAddr, volume, op, name, fullPath, err, latency, ino, parentIno)
}
func LogInodeOp(clientAddr, volume, op, fullPath string, err error, latency int64, ino uint64, fileSize uint64) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt == nil {
return
}
gAdt.LogInodeOp(clientAddr, volume, op, fullPath, err, latency, ino, fileSize)
}
func LogTxOp(clientAddr, volume, op, txId string, err error, latency int64) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt == nil {
return
}
gAdt.LogTxOp(clientAddr, volume, op, txId, err, latency)
}
func ResetWriterBufferSize(size int) {
gAdtMutex.Lock()
defer gAdtMutex.Unlock()
if gAdt == nil {
return
}
gAdt.ResetWriterBufferSize(size)
}
func AddLog(content string) {
gAdtMutex.Lock()
defer gAdtMutex.Unlock()
if gAdt == nil {
return
}
gAdt.AddLog(content)
}
func StopAudit() {
gAdtMutex.Lock()
defer gAdtMutex.Unlock()
if gAdt == nil {
return
}
gAdt.Stop()
gAdt = nil
}
// NOTE: implementation details
func (a *Audit) flushAuditLog() {
cleanTimer := time.NewTimer(DefaultCleanInterval)
for {
select {
case <-a.stopC:
return
case bufSize := <-a.resetWriterBuffC:
a.writerBufSize = bufSize
a.newWriterSize(bufSize)
case aLog := <-a.bufferC:
a.logAudit(aLog)
case <-cleanTimer.C:
a.removeLogFile()
cleanTimer.Reset(DefaultCleanInterval)
}
}
}
func (a *Audit) newWriterSize(size int) error {
a.writerBufSize = size
if a.writer != nil {
a.writer.Flush()
}
if a.logFile == nil {
logFile, err := os.OpenFile(a.logFileName, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0o666)
if err != nil {
log.LogErrorf("newWriterSize failed, logFileName: %s, err: %v\n", a.logFileName, err)
return fmt.Errorf("OpenLogFile failed, logFileName %s", a.logFileName)
}
a.logFile = logFile
if size <= 0 {
log.LogDebugf("newWriterSize : buffer for logFileName: %s is disabled", a.logFileName)
a.writer = bufio.NewWriter(logFile)
} else {
a.writer = bufio.NewWriterSize(logFile, size)
}
} else {
_, err := a.logFile.Stat()
if err == nil {
if size <= 0 {
log.LogErrorf("newWriterSize : buffer for logFileName is disabled")
a.writer = bufio.NewWriter(a.logFile)
} else {
a.writer = bufio.NewWriterSize(a.logFile, size)
}
} else {
a.logFile.Close()
a.logFile = nil
return a.newWriterSize(size)
}
}
return nil
}
func (a *Audit) removeLogFile() {
fs := syscall.Statfs_t{}
if err := syscall.Statfs(a.logDir, &fs); err != nil {
log.LogErrorf("Get fs stat failed, err: %v", err)
return
}
diskSpaceLeft := int64(fs.Bavail * uint64(fs.Bsize))
diskSpaceLeft -= DefaultHeadRoom * 1024 * 1024
fInfos, err := ioutil.ReadDir(a.logDir)
if err != nil {
log.LogErrorf("ReadDir failed, logDir: %s, err: %v", a.logDir, err)
return
}
var needDelFiles ShiftedFile
for _, info := range fInfos {
if a.shouldDelete(info, diskSpaceLeft, Audit_Module) {
needDelFiles = append(needDelFiles, info)
}
}
sort.Sort(needDelFiles)
for _, info := range needDelFiles {
if err = os.Remove(path.Join(a.logDir, info.Name())); err != nil {
log.LogErrorf("Remove log file failed, logFileName: %s, err: %v", info.Name(), err)
continue
}
diskSpaceLeft += info.Size()
if diskSpaceLeft > 0 && time.Since(info.ModTime()) < MaxReservedDays {
break
}
}
}
func (a *Audit) shouldDelete(info os.FileInfo, diskSpaceLeft int64, module string) bool {
isOldAuditLogFile := info.Mode().IsRegular() && strings.HasSuffix(info.Name(), ShiftedExtension) && strings.HasPrefix(info.Name(), module)
if diskSpaceLeft <= 0 {
return isOldAuditLogFile
}
return time.Since(info.ModTime()) > MaxReservedDays && isOldAuditLogFile
}
func (a *Audit) Stop() {
a.lock.Lock()
defer a.lock.Unlock()
close(a.stopC)
a.writer.Flush()
a.logFile.Close()
}
func (a *Audit) logAudit(content string) error {
a.shiftFiles()
fmt.Fprintf(a.writer, "%s\n", content)
if a.writerBufSize <= 0 {
a.writer.Flush()
}
return nil
}
func (a *Audit) shiftFiles() error {
fileInfo, err := os.Stat(a.logFileName)
if err != nil {
return err
}
if fileInfo.Size() < a.logMaxSize {
return nil
}
if syscall.Access(a.logFileName, F_OK) == nil {
logNewFileName := a.logFileName + "." + time.Now().Format(FileNameDateFormat) + ShiftedExtension
a.writer.Flush()
a.logFile.Close()
a.writer = nil
a.logFile = nil
if err = os.Rename(a.logFileName, logNewFileName); err != nil {
log.LogErrorf("RenameFile failed, logFileName: %s, logNewFileName: %s, err: %v\n",
a.logFileName, logNewFileName, err)
return fmt.Errorf("action[shiftFiles] renameFile failed, logFileName %s, logNewFileName %s",
a.logFileName, logNewFileName)
}
}
// NOTE: try to recycle space when shift file
a.removeLogFile()
return a.newWriterSize(a.writerBufSize)
}
func isPathSafe(filePath string) bool {
safePattern := `^[a-zA-Z0-9\-_/]+$`
match, _ := regexp.MatchString(safePattern, filePath)
return match
}
// Copyright 2014 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package btree implements in-memory B-Trees of arbitrary degree.
//
// btree implements an in-memory B-Tree for use as an ordered data structure.
// It is not meant for persistent storage solutions.
//
// It has a flatter structure than an equivalent red-black or other binary tree,
// which in some cases yields better memory usage and/or performance.
// See some discussion on the matter here:
// http://google-opensource.blogspot.com/2013/01/c-containers-that-save-memory-and-time.html
// Note, though, that this project is in no way related to the C++ B-Tree
// implementation written about there.
//
// Within this tree, each node contains a slice of items and a (possibly nil)
// slice of children. For basic numeric values or raw structs, this can cause
// efficiency differences when compared to equivalent C++ template code that
// stores values in arrays within the node:
// * Due to the overhead of storing values as interfaces (each
// value needs to be stored as the value itself, then 2 words for the
// interface pointing to that value and its type), resulting in higher
// memory use.
// * Since interfaces can point to values anywhere in memory, values are
// most likely not stored in contiguous blocks, resulting in a higher
// number of cache misses.
// These issues don't tend to matter, though, when working with strings or other
// heap-allocated structures, since C++-equivalent structures also must store
// pointers and also distribute their values across the heap.
//
// This implementation is designed to be a drop-in replacement to gollrb.LLRB
// trees, (http://github.com/petar/gollrb), an excellent and probably the most
// widely used ordered tree implementation in the Go ecosystem currently.
// Its functions, therefore, exactly mirror those of
// llrb.LLRB where possible. Unlike gollrb, though, we currently don't
// support storing multiple equivalent values.
package btree
import (
"fmt"
"io"
"sort"
"strings"
"sync"
)
// Item represents a single object in the tree.
type Item interface {
// Less tests whether the current item is less than the given argument.
//
// This must provide a strict weak ordering.
// If !a.Less(b) && !b.Less(a), we treat this to mean a == b (i.e. we can only
// hold one of either a or b in the tree).
Less(than Item) bool
Copy() Item
}
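// Example (illustrative sketch): a minimal Item implementation. The Int type below is
// not part of this package; it only shows the expected behavior of Less and Copy for a
// simple value type:
//
//	type Int int
//
//	func (a Int) Less(b Item) bool { return a < b.(Int) }
//	func (a Int) Copy() Item       { return a }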
const (
DefaultFreeListSize = 32
)
var (
nilItems = make(items, 16)
nilChildren = make(children, 16)
)
// FreeList represents a free list of btree nodes. By default each
// BTree has its own FreeList, but multiple BTrees can share the same
// FreeList.
// Two Btrees using the same freelist are safe for concurrent write access.
type FreeList struct {
mu sync.Mutex
freelist []*node
}
// NewFreeList creates a new free list.
// size is the maximum size of the returned free list.
func NewFreeList(size int) *FreeList {
return &FreeList{freelist: make([]*node, 0, size)}
}
func (f *FreeList) newNode() (n *node) {
f.mu.Lock()
index := len(f.freelist) - 1
if index < 0 {
f.mu.Unlock()
return new(node)
}
n = f.freelist[index]
f.freelist[index] = nil
f.freelist = f.freelist[:index]
f.mu.Unlock()
return
}
// freeNode adds the given node to the list, returning true if it was added
// and false if it was discarded.
func (f *FreeList) freeNode(n *node) (out bool) {
f.mu.Lock()
if len(f.freelist) < cap(f.freelist) {
f.freelist = append(f.freelist, n)
out = true
}
f.mu.Unlock()
return
}
// ItemIterator allows callers of Ascend* to iterate in-order over portions of
// the tree. When this function returns false, iteration will stop and the
// associated Ascend* function will immediately return.
type ItemIterator func(i Item) bool
// New creates a new B-Tree with the given degree.
//
// New(2), for example, will create a 2-3-4 tree (each node contains 1-3 items
// and 2-4 children).
func New(degree int) *BTree {
return NewWithFreeList(degree, NewFreeList(DefaultFreeListSize))
}
func NewWithSize(degree, initSize int) *BTree {
return NewWithFreeList(degree, NewFreeList(initSize))
}
// NewWithFreeList creates a new B-Tree that uses the given node free list.
func NewWithFreeList(degree int, f *FreeList) *BTree {
if degree <= 1 {
panic("bad degree")
}
return &BTree{
degree: degree,
cow: &copyOnWriteContext{freelist: f},
}
}
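// Example (illustrative sketch): two trees sharing a single FreeList, which the FreeList
// documentation above states is safe for concurrent write access; the degree is arbitrary:
//
//	fl := NewFreeList(DefaultFreeListSize)
//	t1 := NewWithFreeList(8, fl)
//	t2 := NewWithFreeList(8, fl)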
// items stores items in a node.
type items []Item
// insertAt inserts a value into the given index, pushing all subsequent values
// forward.
func (s *items) insertAt(index int, item Item) {
*s = append(*s, nil)
if index < len(*s) {
copy((*s)[index+1:], (*s)[index:])
}
(*s)[index] = item
}
// removeAt removes a value at a given index, pulling all subsequent values
// back.
func (s *items) removeAt(index int) Item {
item := (*s)[index]
copy((*s)[index:], (*s)[index+1:])
(*s)[len(*s)-1] = nil
*s = (*s)[:len(*s)-1]
return item
}
// pop removes and returns the last element in the list.
func (s *items) pop() (out Item) {
index := len(*s) - 1
out = (*s)[index]
(*s)[index] = nil
*s = (*s)[:index]
return
}
// copy returns a new items slice containing a deep copy of every element.
func (s *items) copy() items {
nItems := make(items, 0, len(*s))
for _, v := range *s {
nItems = append(nItems, v.Copy())
}
return nItems
}
// truncate truncates this instance at index so that it contains only the
// first index items. index must be less than or equal to length.
func (s *items) truncate(index int) {
var toClear items
*s, toClear = (*s)[:index], (*s)[index:]
for len(toClear) > 0 {
toClear = toClear[copy(toClear, nilItems):]
}
}
// find returns the index where the given item should be inserted into this
// list. 'found' is true if the item already exists in the list at the given
// index.
func (s items) find(item Item) (index int, found bool) {
i := sort.Search(len(s), func(i int) bool {
return item.Less(s[i])
})
if i > 0 && !s[i-1].Less(item) {
return i - 1, true
}
return i, false
}
// children stores child nodes in a node.
type children []*node
// insertAt inserts a value into the given index, pushing all subsequent values
// forward.
func (s *children) insertAt(index int, n *node) {
*s = append(*s, nil)
if index < len(*s) {
copy((*s)[index+1:], (*s)[index:])
}
(*s)[index] = n
}
// removeAt removes a value at a given index, pulling all subsequent values
// back.
func (s *children) removeAt(index int) *node {
n := (*s)[index]
copy((*s)[index:], (*s)[index+1:])
(*s)[len(*s)-1] = nil
*s = (*s)[:len(*s)-1]
return n
}
// pop removes and returns the last element in the list.
func (s *children) pop() (out *node) {
index := len(*s) - 1
out = (*s)[index]
(*s)[index] = nil
*s = (*s)[:index]
return
}
// truncate truncates this instance at index so that it contains only the
// first index children. index must be less than or equal to length.
func (s *children) truncate(index int) {
var toClear children
*s, toClear = (*s)[:index], (*s)[index:]
for len(toClear) > 0 {
toClear = toClear[copy(toClear, nilChildren):]
}
}
// node is an internal node in a tree.
//
// It must at all times maintain the invariant that either
// * len(children) == 0, len(items) unconstrained
// * len(children) == len(items) + 1
type node struct {
items items
children children
cow *copyOnWriteContext
}
func (n *node) mutableFor(cow *copyOnWriteContext) *node {
if n.cow == cow {
return n
}
out := cow.newNode()
if cap(out.items) >= len(n.items) {
out.items = out.items[:len(n.items)]
} else {
out.items = make(items, len(n.items), cap(n.items))
}
copy(out.items, n.items.copy())
// Copy children
if cap(out.children) >= len(n.children) {
out.children = out.children[:len(n.children)]
} else {
out.children = make(children, len(n.children), cap(n.children))
}
copy(out.children, n.children)
return out
}
func (n *node) mutableChild(i int) *node {
c := n.children[i].mutableFor(n.cow)
n.children[i] = c
return c
}
// split splits the given node at the given index. The current node shrinks,
// and this function returns the item that existed at that index and a new node
// containing all items/children after it.
func (n *node) split(i int) (Item, *node) {
item := n.items[i]
next := n.cow.newNode()
next.items = append(next.items, n.items[i+1:]...)
n.items.truncate(i)
if len(n.children) > 0 {
next.children = append(next.children, n.children[i+1:]...)
n.children.truncate(i + 1)
}
return item, next
}
// maybeSplitChild checks if a child should be split, and if so splits it.
// Returns whether or not a split occurred.
func (n *node) maybeSplitChild(i, maxItems int) bool {
if len(n.children[i].items) < maxItems {
return false
}
first := n.mutableChild(i)
item, second := first.split(maxItems / 2)
n.items.insertAt(i, item)
n.children.insertAt(i+1, second)
return true
}
// insert inserts an item into the subtree rooted at this node, making sure
// no nodes in the subtree exceed maxItems items. Should an equivalent item
// be found/replaced by insert, it will be returned.
func (n *node) insert(item Item, maxItems int) Item {
i, found := n.items.find(item)
if found {
out := n.items[i]
n.items[i] = item
return out
}
if len(n.children) == 0 {
n.items.insertAt(i, item)
return nil
}
if n.maybeSplitChild(i, maxItems) {
inTree := n.items[i]
switch {
case item.Less(inTree):
// no change, we want first split node
case inTree.Less(item):
i++ // we want second split node
default:
out := n.items[i]
n.items[i] = item
return out
}
}
return n.mutableChild(i).insert(item, maxItems)
}
// get finds the given key in the subtree and returns it.
func (n *node) get(key Item) Item {
i, found := n.items.find(key)
if found {
return n.items[i]
} else if len(n.children) > 0 {
return n.children[i].get(key)
}
return nil
}
func (n *node) copyGet(key Item, cow *copyOnWriteContext) Item {
i, found := n.items.find(key)
if found {
return n.items[i]
} else if len(n.children) > 0 {
child := n.mutableChild(i)
return child.copyGet(key, cow)
}
return nil
}
// min returns the first item in the subtree.
func min(n *node) Item {
for len(n.children) > 0 {
n = n.children[0]
}
if len(n.items) == 0 {
return nil
}
return n.items[0]
}
// max returns the last item in the subtree.
func max(n *node) Item {
for len(n.children) > 0 {
n = n.children[len(n.children)-1]
}
if len(n.items) == 0 {
return nil
}
return n.items[len(n.items)-1]
}
// toRemove details what item to remove in a node.remove call.
type toRemove int
const (
removeItem toRemove = iota // removes the given item
removeMin // removes smallest item in the subtree
removeMax // removes largest item in the subtree
)
// remove removes an item from the subtree rooted at this node.
func (n *node) remove(item Item, minItems int, typ toRemove) Item {
var i int
var found bool
switch typ {
case removeMax:
if len(n.children) == 0 {
return n.items.pop()
}
i = len(n.items)
case removeMin:
if len(n.children) == 0 {
return n.items.removeAt(0)
}
i = 0
case removeItem:
i, found = n.items.find(item)
if len(n.children) == 0 {
if found {
return n.items.removeAt(i)
}
return nil
}
default:
panic("invalid type")
}
// If we get to here, we have children.
if len(n.children[i].items) <= minItems {
return n.growChildAndRemove(i, item, minItems, typ)
}
child := n.mutableChild(i)
// Either we had enough items to begin with, or we've done some
// merging/stealing, because we've got enough now and we're ready to return
// stuff.
if found {
// The item exists at index 'i', and the child we've selected can give us a
// predecessor, since if we've gotten here it's got > minItems items in it.
out := n.items[i]
// We use our special-case 'remove' call with typ=removeMax to pull the
// predecessor of item i (the rightmost leaf of our immediate left child)
// and set it into where we pulled the item from.
n.items[i] = child.remove(nil, minItems, removeMax)
return out
}
// Final recursive call. Once we're here, we know that the item isn't in this
// node and that the child is big enough to remove from.
return child.remove(item, minItems, typ)
}
// growChildAndRemove grows child 'i' to make sure it's possible to remove an
// item from it while keeping it at minItems, then calls remove to actually
// remove it.
//
// Most documentation says we have to do two sets of special casing:
// 1) item is in this node
// 2) item is in child
// In both cases, we need to handle the two subcases:
// A) node has enough values that it can spare one
// B) node doesn't have enough values
// For the latter, we have to check:
// a) left sibling has node to spare
// b) right sibling has node to spare
// c) we must merge
// To simplify our code here, we handle cases #1 and #2 the same:
// If a node doesn't have enough items, we make sure it does (using a,b,c).
// We then simply redo our remove call, and the second time (regardless of
// whether we're in case 1 or 2), we'll have enough items and can guarantee
// that we hit case A.
func (n *node) growChildAndRemove(i int, item Item, minItems int, typ toRemove) Item {
if i > 0 && len(n.children[i-1].items) > minItems {
// Steal from left child
child := n.mutableChild(i)
stealFrom := n.mutableChild(i - 1)
stolenItem := stealFrom.items.pop()
child.items.insertAt(0, n.items[i-1])
n.items[i-1] = stolenItem
if len(stealFrom.children) > 0 {
child.children.insertAt(0, stealFrom.children.pop())
}
} else if i < len(n.items) && len(n.children[i+1].items) > minItems {
// steal from right child
child := n.mutableChild(i)
stealFrom := n.mutableChild(i + 1)
stolenItem := stealFrom.items.removeAt(0)
child.items = append(child.items, n.items[i])
n.items[i] = stolenItem
if len(stealFrom.children) > 0 {
child.children = append(child.children, stealFrom.children.removeAt(0))
}
} else {
if i >= len(n.items) {
i--
}
child := n.mutableChild(i)
// merge with right child
mergeItem := n.items.removeAt(i)
mergeChild := n.children.removeAt(i + 1)
child.items = append(child.items, mergeItem)
child.items = append(child.items, mergeChild.items...)
child.children = append(child.children, mergeChild.children...)
n.cow.freeNode(mergeChild)
}
return n.remove(item, minItems, typ)
}
type direction int
const (
descend = direction(-1)
ascend = direction(+1)
)
// iterate provides a simple method for iterating over elements in the tree.
//
// When ascending, the 'start' should be less than 'stop' and when descending,
// the 'start' should be greater than 'stop'. Setting 'includeStart' to true
// will force the iterator to include the first item when it equals 'start',
// thus creating a "greaterOrEqual" or "lessThanEqual" query rather than just
// a "greaterThan" or "lessThan" query.
func (n *node) iterate(dir direction, start, stop Item, includeStart bool, hit bool, iter ItemIterator) (bool, bool) {
var ok, found bool
var index int
switch dir {
case ascend:
if start != nil {
index, _ = n.items.find(start)
}
for i := index; i < len(n.items); i++ {
if len(n.children) > 0 {
if hit, ok = n.children[i].iterate(dir, start, stop, includeStart, hit, iter); !ok {
return hit, false
}
}
if !includeStart && !hit && start != nil && !start.Less(n.items[i]) {
hit = true
continue
}
hit = true
if stop != nil && !n.items[i].Less(stop) {
return hit, false
}
if !iter(n.items[i]) {
return hit, false
}
}
if len(n.children) > 0 {
if hit, ok = n.children[len(n.children)-1].iterate(dir, start, stop, includeStart, hit, iter); !ok {
return hit, false
}
}
case descend:
if start != nil {
index, found = n.items.find(start)
if !found {
index = index - 1
}
} else {
index = len(n.items) - 1
}
for i := index; i >= 0; i-- {
if start != nil && !n.items[i].Less(start) {
if !includeStart || hit || start.Less(n.items[i]) {
continue
}
}
if len(n.children) > 0 {
if hit, ok = n.children[i+1].iterate(dir, start, stop, includeStart, hit, iter); !ok {
return hit, false
}
}
if stop != nil && !stop.Less(n.items[i]) {
return hit, false // continue
}
hit = true
if !iter(n.items[i]) {
return hit, false
}
}
if len(n.children) > 0 {
if hit, ok = n.children[0].iterate(dir, start, stop, includeStart, hit, iter); !ok {
return hit, false
}
}
default:
// do nothing
}
return hit, true
}
// print is used for testing/debugging purposes.
func (n *node) print(w io.Writer, level int) {
fmt.Fprintf(w, "%sNODE:%v\n", strings.Repeat(" ", level), n.items)
for _, c := range n.children {
c.print(w, level+1)
}
}
// BTree is an implementation of a B-Tree.
//
// BTree stores Item instances in an ordered structure, allowing easy insertion,
// removal, and iteration.
//
// Write operations are not safe for concurrent mutation by multiple
// goroutines, but Read operations are.
type BTree struct {
degree int
length int
root *node
cow *copyOnWriteContext
}
// copyOnWriteContext pointers determine node ownership... a tree with a write
// context equivalent to a node's write context is allowed to modify that node.
// A tree whose write context does not match a node's is not allowed to modify
// it, and must create a new, writable copy (i.e., it's a Clone).
//
// When doing any write operation, we maintain the invariant that the current
// node's context is equal to the context of the tree that requested the write.
// We do this by, before we descend into any node, creating a copy with the
// correct context if the contexts don't match.
//
// Since the node we're currently visiting on any write has the requesting
// tree's context, that node is modifiable in place. Children of that node may
// not share context, but before we descend into them, we'll make a mutable
// copy.
type copyOnWriteContext struct {
freelist *FreeList
}
// Clone clones the btree, lazily. Clone should not be called concurrently,
// but the original tree (t) and the new tree (t2) can be used concurrently
// once the Clone call completes.
//
// The internal tree structure of t is marked read-only and shared between t and
// t2. Writes to both t and t2 use copy-on-write logic, creating new nodes
// whenever one of t's original nodes would have been modified. Read operations
// should have no performance degradation. Write operations for both t and t2
// will initially experience minor slow-downs caused by additional allocs and
// copies due to the aforementioned copy-on-write logic, but should converge to
// the original performance characteristics of the original tree.
func (t *BTree) Clone() (t2 *BTree) {
// Create two entirely new copy-on-write contexts.
// This operation effectively creates three trees:
// the original, shared nodes (old t.cow)
// the new t.cow nodes
// the new out.cow nodes
cow1, cow2 := *t.cow, *t.cow
out := *t
t.cow = &cow1
out.cow = &cow2
return &out
}
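// Illustrative usage sketch (added for clarity; not part of the original
// source). It assumes the package's New(degree) constructor, as in the
// upstream google/btree API this implementation derives from:
//
//	t := New(2)
//	t.ReplaceOrInsert(Int(1))
//	t2 := t.Clone()
//	t2.ReplaceOrInsert(Int(2))
//	// t.Len() == 1 and t2.Len() == 2; the node holding Int(1) stays shared
//	// until one of the trees writes to it, at which point it is copied.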
// maxItems returns the max number of items to allow per node.
func (t *BTree) maxItems() int {
return t.degree*2 - 1
}
// minItems returns the min number of items to allow per node (ignored for the
// root node).
func (t *BTree) minItems() int {
return t.degree - 1
}
func (c *copyOnWriteContext) newNode() (n *node) {
n = c.freelist.newNode()
n.cow = c
return
}
type freeType int
const (
ftFreelistFull freeType = iota // node was freed (available for GC, not stored in freelist)
ftStored // node was stored in the freelist for later use
ftNotOwned // node was ignored by COW, since it's owned by another context
)
// freeNode frees a node within a given COW context, if it's owned by that
// context. It returns what happened to the node (see freeType const
// documentation).
func (c *copyOnWriteContext) freeNode(n *node) freeType {
if n.cow == c {
// clear to allow GC
n.items.truncate(0)
n.children.truncate(0)
n.cow = nil
if c.freelist.freeNode(n) {
return ftStored
} else {
return ftFreelistFull
}
} else {
return ftNotOwned
}
}
// ReplaceOrInsert adds the given item to the tree. If an item in the tree
// already equals the given one, it is removed from the tree and returned.
// Otherwise, nil is returned.
//
// nil cannot be added to the tree (will panic).
func (t *BTree) ReplaceOrInsert(item Item) Item {
if item == nil {
panic("nil item being added to BTree")
}
if t.root == nil {
t.root = t.cow.newNode()
t.root.items = append(t.root.items, item)
t.length++
return nil
} else {
t.root = t.root.mutableFor(t.cow)
if len(t.root.items) >= t.maxItems() {
item2, second := t.root.split(t.maxItems() / 2)
oldroot := t.root
t.root = t.cow.newNode()
t.root.items = append(t.root.items, item2)
t.root.children = append(t.root.children, oldroot, second)
}
}
out := t.root.insert(item, t.maxItems())
if out == nil {
t.length++
}
return out
}
// Delete removes an item equal to the passed in item from the tree, returning
// it. If no such item exists, returns nil.
func (t *BTree) Delete(item Item) Item {
return t.deleteItem(item, removeItem)
}
// DeleteMin removes the smallest item in the tree and returns it.
// If no such item exists, returns nil.
func (t *BTree) DeleteMin() Item {
return t.deleteItem(nil, removeMin)
}
// DeleteMax removes the largest item in the tree and returns it.
// If no such item exists, returns nil.
func (t *BTree) DeleteMax() Item {
return t.deleteItem(nil, removeMax)
}
func (t *BTree) deleteItem(item Item, typ toRemove) Item {
if t.root == nil || len(t.root.items) == 0 {
return nil
}
t.root = t.root.mutableFor(t.cow)
out := t.root.remove(item, t.minItems(), typ)
if len(t.root.items) == 0 && len(t.root.children) > 0 {
oldroot := t.root
t.root = t.root.children[0]
t.cow.freeNode(oldroot)
}
if out != nil {
t.length--
}
return out
}
// AscendRange calls the iterator for every value in the tree within the range
// [greaterOrEqual, lessThan), until iterator returns false.
func (t *BTree) AscendRange(greaterOrEqual, lessThan Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, greaterOrEqual, lessThan, true, false, iterator)
}
// AscendLessThan calls the iterator for every value in the tree within the range
// [first, pivot), until iterator returns false.
func (t *BTree) AscendLessThan(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, nil, pivot, false, false, iterator)
}
// AscendGreaterOrEqual calls the iterator for every value in the tree within
// the range [pivot, last], until iterator returns false.
func (t *BTree) AscendGreaterOrEqual(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, pivot, nil, true, false, iterator)
}
// AscendGreaterThan calls the iterator for every value in the tree within
// the range (pivot, last], until iterator returns false.
func (t *BTree) AscendGreaterThan(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, pivot, nil, false, false, iterator)
}
// Ascend calls the iterator for every value in the tree within the range
// [first, last], until iterator returns false.
func (t *BTree) Ascend(iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, nil, nil, false, false, iterator)
}
// DescendRange calls the iterator for every value in the tree within the range
// [lessOrEqual, greaterThan), until iterator returns false.
func (t *BTree) DescendRange(lessOrEqual, greaterThan Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(descend, lessOrEqual, greaterThan, true, false, iterator)
}
// DescendLessOrEqual calls the iterator for every value in the tree within the range
// [pivot, first], until iterator returns false.
func (t *BTree) DescendLessOrEqual(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(descend, pivot, nil, true, false, iterator)
}
// DescendGreaterThan calls the iterator for every value in the tree within
// the range (pivot, last], until iterator returns false.
func (t *BTree) DescendGreaterThan(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(descend, nil, pivot, false, false, iterator)
}
// Descend calls the iterator for every value in the tree within the range
// [last, first], until iterator returns false.
func (t *BTree) Descend(iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(descend, nil, nil, false, false, iterator)
}
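// Illustrative usage sketch (added for clarity; not part of the original
// source), using the Int item type defined below and assuming the iterator
// signature func(Item) bool from the upstream google/btree API. For a tree t
// holding Int(1)..Int(5):
//
//	t.AscendRange(Int(2), Int(4), func(i Item) bool {
//		fmt.Println(i) // prints 2, then 3; the range is half-open, so 4 is excluded
//		return true    // returning false stops the iteration early
//	})
//	t.Descend(func(i Item) bool {
//		fmt.Println(i) // prints 5, 4, 3, 2, 1
//		return true
//	})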
// Get looks for the key item in the tree, returning it. It returns nil if
// unable to find that item.
func (t *BTree) Get(key Item) Item {
if t.root == nil {
return nil
}
return t.root.get(key)
}
func (t *BTree) CopyGet(key Item) Item {
if t.root == nil {
return nil
}
t.root = t.root.mutableFor(t.cow)
item := t.root.copyGet(key, t.cow)
return item
}
// Min returns the smallest item in the tree, or nil if the tree is empty.
func (t *BTree) Min() Item {
if t.root == nil {
return nil
}
return min(t.root)
}
// Max returns the largest item in the tree, or nil if the tree is empty.
func (t *BTree) Max() Item {
if t.root == nil {
return nil
}
return max(t.root)
}
// Has returns true if the given key is in the tree.
func (t *BTree) Has(key Item) bool {
return t.Get(key) != nil
}
// Len returns the number of items currently in the tree.
func (t *BTree) Len() int {
return t.length
}
// Clear removes all items from the btree. If addNodesToFreelist is true,
// t's nodes are added to its freelist as part of this call, until the freelist
// is full. Otherwise, the root node is simply dereferenced and the subtree
// left to Go's normal GC processes.
//
// This can be much faster
// than calling Delete on all elements, because that requires finding/removing
// each element in the tree and updating the tree accordingly. It also is
// somewhat faster than creating a new tree to replace the old one, because
// nodes from the old tree are reclaimed into the freelist for use by the new
// one, instead of being lost to the garbage collector.
//
// This call takes:
// O(1): when addNodesToFreelist is false, this is a single operation.
// O(1): when the freelist is already full, it breaks out immediately
// O(freelist size): when the freelist is empty and the nodes are all owned
// by this tree, nodes are added to the freelist until full.
// O(tree size): when all nodes are owned by another tree, all nodes are
// iterated over looking for nodes to add to the freelist, and due to
// ownership, none are.
func (t *BTree) Clear(addNodesToFreelist bool) {
if t.root != nil && addNodesToFreelist {
t.root.reset(t.cow)
}
t.root, t.length = nil, 0
}
// reset returns a subtree to the freelist. It breaks out immediately if the
// freelist is full, since the only benefit of iterating is to fill that
// freelist up. Returns true if parent reset call should continue.
func (n *node) reset(c *copyOnWriteContext) bool {
for _, child := range n.children {
if !child.reset(c) {
return false
}
}
return c.freeNode(n) != ftFreelistFull
}
// Int implements the Item interface for integers.
type Int int
func (a Int) Copy() Item {
return a
}
// Less returns true if int(a) < int(b).
func (a Int) Less(b Item) bool {
return a < b.(Int)
}
package buf
import (
"context"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
var (
bcacheTotalLimit int64
bcacheRateLimit = rate.NewLimiter(rate.Limit(1), 16)
bcacheCount int64
BCachePool *FileBCachePool
)
func newBlockCachePool(blockSize int) *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if atomic.LoadInt64(&bcacheCount) >= bcacheTotalLimit {
log.LogWarnf("FileBCachePool: bcacheCount=(%v),bcacheTotalLimit=(%v)", atomic.LoadInt64(&bcacheCount), bcacheTotalLimit)
ctx := context.Background()
bcacheRateLimit.Wait(ctx)
}
return make([]byte, blockSize)
},
}
}
type FileBCachePool struct {
pool *sync.Pool
}
func InitbCachePool(blockSize int) {
if blockSize == 0 {
return
}
BCachePool = &FileBCachePool{}
bcacheTotalLimit = int64((4 * util.GB) / blockSize)
BCachePool.pool = newBlockCachePool(blockSize)
}
func (fileCachePool *FileBCachePool) Get() []byte {
atomic.AddInt64(&bcacheCount, 1)
return fileCachePool.pool.Get().([]byte)
}
func (fileCachePool *FileBCachePool) Put(data []byte) {
atomic.AddInt64(&bcacheCount, -1)
fileCachePool.pool.Put(data)
}
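// Illustrative usage sketch (added for clarity; not part of the original
// source): the pool must be initialized once with the cache block size before
// Get/Put are used. The block size below is hypothetical.
//
//	InitbCachePool(128 * 1024)
//	block := BCachePool.Get()
//	// ... fill block with cached file data ...
//	BCachePool.Put(block)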
package buf
import (
"context"
"fmt"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/util"
"golang.org/x/time/rate"
)
const (
HeaderBufferPoolSize = 8192
InvalidLimit = 0
)
var ReadBufPool = sync.Pool{
New: func() interface{} {
b := make([]byte, 32*1024)
return b
},
}
const (
BufferTypeHeader = 0
BufferTypeNormal = 1
BufferTypeHeaderVer = 2
)
var (
tinyBuffersTotalLimit int64 = 4096
NormalBuffersTotalLimit int64
HeadBuffersTotalLimit int64
HeadVerBuffersTotalLimit int64
)
var (
tinyBuffersCount int64
normalBuffersCount int64
headBuffersCount int64
headVerBuffersCount int64
)
var (
normalBufAllocId uint64
headBufAllocId uint64
headBufVerAllocId uint64
)
var (
normalBufFreeId uint64
headBufFreeId uint64
headBufVerFreeId uint64
)
var (
buffersRateLimit = rate.NewLimiter(rate.Limit(16), 16)
normalBuffersRateLimit = rate.NewLimiter(rate.Limit(16), 16)
headBuffersRateLimit = rate.NewLimiter(rate.Limit(16), 16)
headVerBuffersRateLimit = rate.NewLimiter(rate.Limit(16), 16)
)
func NewTinyBufferPool() *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if atomic.LoadInt64(&tinyBuffersCount) >= tinyBuffersTotalLimit {
ctx := context.Background()
buffersRateLimit.Wait(ctx)
}
return make([]byte, util.DefaultTinySizeLimit)
},
}
}
func NewHeadVerBufferPool() *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if HeadVerBuffersTotalLimit != InvalidLimit && atomic.LoadInt64(&headVerBuffersCount) >= HeadVerBuffersTotalLimit {
ctx := context.Background()
headVerBuffersRateLimit.Wait(ctx)
}
return make([]byte, util.PacketHeaderVerSize)
},
}
}
func NewHeadBufferPool() *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if HeadBuffersTotalLimit != InvalidLimit && atomic.LoadInt64(&headBuffersCount) >= HeadBuffersTotalLimit {
ctx := context.Background()
headBuffersRateLimit.Wait(ctx)
}
return make([]byte, util.PacketHeaderSize)
},
}
}
func NewNormalBufferPool() *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if NormalBuffersTotalLimit != InvalidLimit && atomic.LoadInt64(&normalBuffersCount) >= NormalBuffersTotalLimit {
ctx := context.Background()
normalBuffersRateLimit.Wait(ctx)
}
return make([]byte, util.BlockSize)
},
}
}
// BufferPool defines the struct of a buffered pool with four kinds of buffers:
// tiny, header, header-with-version, and normal (block-sized).
type BufferPool struct {
headPools []chan []byte
headVerPools []chan []byte
normalPools []chan []byte
tinyPool *sync.Pool
headPool *sync.Pool
normalPool *sync.Pool
headVerPool *sync.Pool
}
var slotCnt = uint64(16)
// NewBufferPool returns a new buffered pool.
func NewBufferPool() (bufferP *BufferPool) {
bufferP = &BufferPool{}
bufferP.headPools = make([]chan []byte, slotCnt)
bufferP.normalPools = make([]chan []byte, slotCnt)
bufferP.headVerPools = make([]chan []byte, slotCnt)
for i := 0; i < int(slotCnt); i++ {
bufferP.headPools[i] = make(chan []byte, HeaderBufferPoolSize/slotCnt)
bufferP.headVerPools[i] = make(chan []byte, HeaderBufferPoolSize/slotCnt)
bufferP.normalPools[i] = make(chan []byte, HeaderBufferPoolSize/slotCnt)
}
bufferP.tinyPool = NewTinyBufferPool()
bufferP.headPool = NewHeadBufferPool()
bufferP.headVerPool = NewHeadVerBufferPool()
bufferP.normalPool = NewNormalBufferPool()
return bufferP
}
func (bufferP *BufferPool) getHead(id uint64) (data []byte) {
select {
case data = <-bufferP.headPools[id%slotCnt]:
return
default:
return bufferP.headPool.Get().([]byte)
}
}
func (bufferP *BufferPool) getHeadVer(id uint64) (data []byte) {
select {
case data = <-bufferP.headVerPools[id%slotCnt]:
return
default:
return bufferP.headVerPool.Get().([]byte)
}
}
func (bufferP *BufferPool) getNormal(id uint64) (data []byte) {
select {
case data = <-bufferP.normalPools[id%slotCnt]:
return
default:
return bufferP.normalPool.Get().([]byte)
}
}
// Get returns a buffer of the given size. Each supported size corresponds to a different object pool.
func (bufferP *BufferPool) Get(size int) (data []byte, err error) {
if size == util.PacketHeaderSize {
atomic.AddInt64(&headBuffersCount, 1)
id := atomic.AddUint64(&headBufAllocId, 1)
return bufferP.getHead(id), nil
} else if size == util.PacketHeaderVerSize {
atomic.AddInt64(&headVerBuffersCount, 1)
id := atomic.AddUint64(&headBufVerAllocId, 1)
return bufferP.getHeadVer(id), nil
} else if size == util.BlockSize {
atomic.AddInt64(&normalBuffersCount, 1)
id := atomic.AddUint64(&normalBufAllocId, 1)
return bufferP.getNormal(id), nil
} else if size == util.DefaultTinySizeLimit {
atomic.AddInt64(&tinyBuffersCount, 1)
return bufferP.tinyPool.Get().([]byte), nil
}
return nil, fmt.Errorf("can only support 45 or 65536 bytes")
}
func (bufferP *BufferPool) putHead(index int, data []byte) {
select {
case bufferP.headPools[index] <- data:
return
default:
bufferP.headPool.Put(data)
}
}
func (bufferP *BufferPool) putHeadVer(index int, data []byte) {
select {
case bufferP.headVerPools[index] <- data:
return
default:
bufferP.headVerPool.Put(data)
}
}
func (bufferP *BufferPool) putNormal(index int, data []byte) {
select {
case bufferP.normalPools[index] <- data:
return
default:
bufferP.normalPool.Put(data)
}
}
// Put puts the given data into the buffer pool.
func (bufferP *BufferPool) Put(data []byte) {
if data == nil {
return
}
size := len(data)
if size == util.PacketHeaderSize {
atomic.AddInt64(&headBuffersCount, -1)
id := atomic.AddUint64(&headBufFreeId, 1)
bufferP.putHead(int(id%slotCnt), data)
} else if size == util.PacketHeaderVerSize {
atomic.AddInt64(&headVerBuffersCount, -1)
id := atomic.AddUint64(&headBufVerFreeId, 1)
bufferP.putHeadVer(int(id%slotCnt), data)
} else if size == util.BlockSize {
atomic.AddInt64(&normalBuffersCount, -1)
id := atomic.AddUint64(&normalBufFreecId, 1)
bufferP.putNormal(int(id%slotCnt), data)
} else if size == util.DefaultTinySizeLimit {
bufferP.tinyPool.Put(data)
atomic.AddInt64(&tinyBuffersCount, -1)
}
}
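// Illustrative usage sketch (added for clarity; not part of the original
// source): buffers are requested by exact size and must be returned with Put
// so the per-size counters stay balanced.
//
//	pool := NewBufferPool()
//	data, err := pool.Get(util.BlockSize)
//	if err != nil {
//		// only the header, header-with-version, block and tiny sizes are supported
//	}
//	// ... use data as a packet payload buffer ...
//	pool.Put(data)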
package buf
import (
"context"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
var (
cacheTotalLimit int64
cacheRateLimit = rate.NewLimiter(rate.Limit(16), 16)
cacheCount int64
CachePool *FileCachePool
)
func newWriterCachePool(blockSize int) *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if atomic.LoadInt64(&cacheCount) >= cacheTotalLimit {
ctx := context.Background()
cacheRateLimit.Wait(ctx)
}
return make([]byte, blockSize)
},
}
}
type FileCachePool struct {
pool *sync.Pool
}
func InitCachePool(blockSize int) {
if blockSize == 0 {
return
}
CachePool = &FileCachePool{}
cacheTotalLimit = int64((4 * util.GB) / blockSize)
CachePool.pool = newWriterCachePool(blockSize)
}
func (fileCachePool *FileCachePool) Get() []byte {
atomic.AddInt64(&cacheCount, 1)
return fileCachePool.pool.Get().([]byte)
}
func (fileCachePool *FileCachePool) Put(data []byte) {
log.LogInfof("action[FileCachePool.put] %v", fileCachePool)
log.LogInfof("action[FileCachePool.put] pool %v", fileCachePool.pool)
atomic.AddInt64(&cacheCount, -1)
fileCachePool.pool.Put(data)
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package caps
import (
"encoding/json"
"fmt"
"regexp"
"strings"
)
// Caps defines the capability type
type Caps struct {
API []string
OwnerVOL []string
NoneOwnerVOL []string
}
// ContainCaps reports whether the category cat contains the capability cap.
func (c *Caps) ContainCaps(cat string, cap string) (r bool) {
if cat == "API" {
return traversalCaps(c.API, cap)
} else if cat == "OwnerVOL" {
return traversalCaps(c.OwnerVOL, cap)
} else if cat == "NoneOwnerVOL" {
return traversalCaps(c.NoneOwnerVOL, cap)
}
return false
}
func traversalCaps(caps []string, cap string) (r bool) {
r = false
for _, s := range caps {
a := strings.Split(s, ":")
b := strings.Split(cap, ":")
i := 0
for ; i < 3; i++ {
if a[i] != "*" && a[i] != b[i] {
break
}
}
if i == 3 {
r = true
break
}
}
return
}
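// Illustrative sketch (added for clarity; not part of the original source):
// each capability is three colon-separated segments and "*" matches any
// segment, so a single wildcard entry can cover many concrete capabilities.
//
//	c := &Caps{API: []string{"*:vol:*"}}
//	c.ContainCaps("API", "clusterX:vol:create")  // true: "*" matches both ends
//	c.ContainCaps("API", "clusterX:user:create") // false: "vol" != "user"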
// Init initializes a Caps instance from JSON-encoded bytes.
func (c *Caps) Init(b []byte) (err error) {
if err = json.Unmarshal(b, c); err != nil {
return
}
if err = c.check(); err != nil {
return
}
c.cleanDup()
return
}
// Dump returns a string representation of the Caps content.
func (c *Caps) Dump() (d string) {
for _, s := range c.API {
d += fmt.Sprintf("API:%s,", s)
}
// TODO c.vol (no usage?)
return
}
// Union merges the given caps into c, removing duplicates.
func (c *Caps) Union(caps *Caps) {
c.API = append(c.API, caps.API...)
c.OwnerVOL = append(c.OwnerVOL, caps.OwnerVOL...)
c.NoneOwnerVOL = append(c.NoneOwnerVOL, caps.NoneOwnerVOL...)
c.cleanDup()
}
func (c *Caps) check() (err error) {
apiRe := regexp.MustCompile("^[A-Za-z0-9*]{1,20}:[A-Za-z0-9*]{1,20}:[A-Za-z0-9*]{1,20}$")
volRe := regexp.MustCompile("^[A-Za-z0-9*]{1,20}:[a-zA-Z0-9_-]{3,256}:[A-Za-z0-9*]{1,20}$")
if err = checkRegexp(apiRe, c.API); err != nil {
return
}
if err = checkRegexp(volRe, c.OwnerVOL); err != nil {
return
}
if err = checkRegexp(volRe, c.NoneOwnerVOL); err != nil {
return
}
return
}
func checkRegexp(re *regexp.Regexp, caps []string) (err error) {
for _, cap := range caps {
if !re.MatchString(cap) {
err = fmt.Errorf("invalid cap [%s]", cap)
return
}
}
return
}
// Delete removes the given caps from c.
func (c *Caps) Delete(caps *Caps) {
c.API = deleteCaps(c.API, caps.API)
c.OwnerVOL = deleteCaps(c.OwnerVOL, caps.OwnerVOL)
c.NoneOwnerVOL = deleteCaps(c.NoneOwnerVOL, caps.NoneOwnerVOL)
}
func deleteCaps(caps []string, deleteCaps []string) []string {
m := make(map[string]bool)
for _, item := range caps {
m[item] = true
}
caps = []string{}
for _, item := range deleteCaps {
delete(m, item)
}
for k := range m {
caps = append(caps, k)
}
return caps
}
func (c *Caps) cleanDup() {
c.API = cleanCaps(c.API)
c.OwnerVOL = cleanCaps(c.OwnerVOL)
c.NoneOwnerVOL = cleanCaps(c.NoneOwnerVOL)
}
func cleanCaps(caps []string) []string {
newCaps := make([]string, 0)
m := make(map[string]map[string]bool)
for _, cap := range caps {
a := strings.Split(cap, ":")
key1 := a[0]
key2 := a[1] + ":" + a[2]
if _, ok := m[key1]; !ok {
m[key1] = make(map[string]bool)
}
if _, ok := m[key1][key2]; !ok {
newCaps = append(newCaps, cap)
m[key1][key2] = true
}
}
return newCaps
}
package util
/*
* Copyright 2016, 2017 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import (
"fmt"
"os"
"golang.org/x/sys/unix"
)
// MaxNameLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
// that unix.UnixRights appears to make the assumption that fd is always int32,
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
// For some reason, unix.Recvmsg uses the length rather than the capacity
// when passing the msg_controllen and other attributes to recvmsg. So we
// have to actually set the length.
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
if err != nil {
return nil, err
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
scms, err := unix.ParseSocketControlMessage(oob)
if err != nil {
return nil, err
}
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return nil, err
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
fd := uintptr(fds[0])
return os.NewFile(fd, string(name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket *os.File, name string, fd uintptr) error {
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
return SendFds(socket, []byte(name), int(fd))
}
// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
func SendFds(socket *os.File, msg []byte, fds ...int) error {
oob := unix.UnixRights(fds...)
return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
}
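// Illustrative usage sketch (added for clarity; not part of the original
// source): passing an open file descriptor between the two ends of an AF_UNIX
// socket pair. The file path is hypothetical.
//
//	pair, _ := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM, 0)
//	sender := os.NewFile(uintptr(pair[0]), "sender")
//	receiver := os.NewFile(uintptr(pair[1]), "receiver")
//	f, _ := os.Open("/tmp/data")
//	_ = SendFd(sender, f.Name(), f.Fd())
//	dup, _ := RecvFd(receiver) // dup refers to the same open file description as f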
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package compressor
const EncodingGzip = "gzip"
// Compressor compresses and decompresses byte slices.
// TODO: add stream Compressor.
type Compressor interface {
Compress([]byte) ([]byte, error)
Decompress([]byte) ([]byte, error)
}
type none struct{}
func (none) Compress(pb []byte) ([]byte, error) { return pb, nil }
func (none) Decompress(cb []byte) ([]byte, error) { return cb, nil }
var compressors = make(map[string]func() Compressor)
func init() {
compressors[""] = func() Compressor { return none{} }
compressors[EncodingGzip] = func() Compressor { return gzipCompressor{} }
}
func New(encoding string) Compressor {
if newCompressor, ok := compressors[encoding]; ok {
return newCompressor()
}
return compressors[""]()
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package compressor
import (
"bytes"
"compress/gzip"
"io"
)
// TODO: reuse bytes.Buffer
type gzipCompressor struct{}
func (gzipCompressor) Compress(pb []byte) ([]byte, error) {
buffer := new(bytes.Buffer)
gw := gzip.NewWriter(buffer)
if _, err := gw.Write(pb); err != nil {
return nil, err
}
if err := gw.Close(); err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
func (gzipCompressor) Decompress(cb []byte) ([]byte, error) {
gr, err := gzip.NewReader(bytes.NewBuffer(cb))
if err != nil {
return nil, err
}
buffer := new(bytes.Buffer)
if _, err := io.Copy(buffer, gr); err != nil {
return nil, err
}
if err := gr.Close(); err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
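// Illustrative usage sketch (added for clarity; not part of the original
// source): the encoding string selects the implementation and the empty
// string is a pass-through.
//
//	c := New(EncodingGzip)
//	compressed, err := c.Compress([]byte("hello"))
//	if err == nil {
//		original, _ := c.Decompress(compressed)
//		// bytes.Equal(original, []byte("hello")) == true
//	}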
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package config
import (
"encoding/json"
"fmt"
"log"
"os"
"path"
"strconv"
"strings"
)
const (
DefaultConstConfigFile = "constcfg"
ClusterVersionFile = "CLUSTER-VERSION"
ClusterUUID = "ClusterUUID"
)
// Config defines the struct of a configuration in general.
type Config struct {
data map[string]interface{}
Raw []byte
}
func newConfig() *Config {
result := new(Config)
result.data = make(map[string]interface{})
return result
}
// LoadConfigFile loads config information from a JSON file.
func LoadConfigFile(filename string) (*Config, error) {
result := newConfig()
err := result.parse(filename)
if err != nil {
log.Printf("error loading config file %s: %s", filename, err)
}
return result, err
}
// LoadConfigString loads config information from a JSON string.
func LoadConfigString(s string) *Config {
result := newConfig()
decoder := json.NewDecoder(strings.NewReader(s))
decoder.UseNumber()
err := decoder.Decode(&result.data)
if err != nil {
log.Fatalf("error parsing config string %s: %s", s, err)
}
return result
}
func (c *Config) parse(fileName string) error {
jsonFileBytes, err := os.ReadFile(fileName)
c.Raw = jsonFileBytes
if err == nil {
decoder := json.NewDecoder(strings.NewReader(string(jsonFileBytes)))
decoder.UseNumber()
err = decoder.Decode(&c.data)
}
return err
}
// GetValue returns the raw data for the config key.
func (c *Config) GetValue(key string) interface{} {
return c.data[key]
}
// GetString returns a string for the config key.
func (c *Config) GetString(key string) string {
x, present := c.data[key]
if !present {
return ""
}
if result, isString := x.(string); isString {
return result
}
return ""
}
// SetString sets a string value for the config key.
func (c *Config) SetString(key, val string) {
c.data[key] = val
}
// GetFloat returns a float value for the config key.
func (c *Config) GetFloat(key string) float64 {
x, present := c.data[key]
if !present {
return -1
}
if result, isNumber := x.(json.Number); isNumber {
number, err := result.Float64()
if err != nil {
return 0
}
return number
}
return 0
}
// GetBoolWithDefault returns a bool value for the config key, or defval when the key is not present.
func (c *Config) GetBoolWithDefault(key string, defval bool) bool {
_, present := c.data[key]
if !present {
return defval
}
return c.GetBool(key)
}
// GetBool returns a bool value for the config key.
func (c *Config) GetBool(key string) bool {
x, present := c.data[key]
if !present {
return false
}
if result, isBool := x.(bool); isBool {
return result
}
if result, isString := x.(string); isString {
if result == "true" {
return true
}
}
return false
}
// GetInt returns an int value for the config key.
func (c *Config) GetInt(key string) int {
return int(c.GetInt64(key))
}
// GetInt64 returns an int64 value for the config key.
func (c *Config) GetInt64(key string) int64 {
x, present := c.data[key]
if !present {
return 0
}
if result, isNumber := x.(json.Number); isNumber {
number, err := result.Int64()
if err != nil {
return 0
}
return number
}
// TODO: change all int64 setting with string configurations to int64
// try parse int64 from string
if numStr, isString := x.(string); isString {
number, err := strconv.ParseInt(numStr, 10, 64)
if err == nil {
return number
}
}
return 0
}
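// Illustrative sketch (added for clarity; not part of the original source):
// GetInt64 accepts both JSON numbers and numeric strings, so the two
// hypothetical keys below read back the same kind of value.
//
//	cfg := LoadConfigString(`{"listen": "17010", "retainLogs": 2000}`)
//	cfg.GetInt64("listen")     // 17010, parsed from the string form
//	cfg.GetInt64("retainLogs") // 2000, decoded as a json.Number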
func (c *Config) HasKey(key string) bool {
_, present := c.data[key]
return present
}
// GetInt64WithDefault returns an int64 value for the config key, or defaultVal when the stored value is absent or zero.
func (c *Config) GetInt64WithDefault(key string, defaultVal int64) int64 {
if val := c.GetInt64(key); val == 0 {
return defaultVal
} else {
return val
}
}
// GetIntWithDefault returns an int value for the config key, or defaultVal when the stored value is absent or zero.
func (c *Config) GetIntWithDefault(key string, defaultVal int) int {
val := int(c.GetInt64(key))
if val == 0 {
return defaultVal
}
return val
}
// GetSlice returns an array for the config key.
func (c *Config) GetSlice(key string) []interface{} {
result, present := c.data[key]
if !present {
return []interface{}(nil)
}
return result.([]interface{})
}
func (c *Config) GetStringSlice(key string) []string {
s := c.GetSlice(key)
result := make([]string, 0, len(s))
for _, item := range s {
result = append(result, item.(string))
}
return result
}
// CheckAndGetString returns the string value for the config key and whether it is present as a string.
func (c *Config) CheckAndGetString(key string) (string, bool) {
x, present := c.data[key]
if !present {
return "", false
}
if result, isString := x.(string); isString {
return result, true
}
return "", false
}
// CheckAndGetBool returns the bool value for the config key and whether a valid bool was found.
func (c *Config) CheckAndGetBool(key string) (bool, bool) {
x, present := c.data[key]
if !present {
return false, false
}
if result, isBool := x.(bool); isBool {
return result, true
}
// Take string value "true" and "false" as well.
if result, isString := x.(string); isString {
if result == "true" {
return true, true
}
if result == "false" {
return false, true
}
}
return false, false
}
func NewIllegalConfigError(configKey string) error {
return fmt.Errorf("illegal config %s", configKey)
}
type ConstConfig struct {
Listen string `json:"listen"`
RaftReplicaPort string `json:"raftReplicaPort"`
RaftHeartbetPort string `json:"raftHeartbetPort"`
}
func (ccfg *ConstConfig) Equals(cfg *ConstConfig) bool {
return (ccfg.Listen == cfg.Listen &&
ccfg.RaftHeartbetPort == cfg.RaftHeartbetPort &&
ccfg.RaftReplicaPort == cfg.RaftReplicaPort)
}
// check listen port, raft replica port and raft heartbeat port
func CheckOrStoreConstCfg(fileDir, fileName string, cfg *ConstConfig) (ok bool, err error) {
filePath := path.Join(fileDir, fileName)
var buf []byte
buf, err = os.ReadFile(filePath)
if err != nil && !os.IsNotExist(err) {
return false, fmt.Errorf("read config file %v failed: %v", filePath, err)
}
if os.IsNotExist(err) || len(buf) == 0 {
// Persist configuration to disk
if buf, err = json.Marshal(cfg); err != nil {
return false, fmt.Errorf("marshal const config failed: %v", err)
}
if err = os.MkdirAll(fileDir, 0o755); err != nil {
return false, fmt.Errorf("make directory %v filed: %v", fileDir, err)
}
var file *os.File
if file, err = os.OpenFile(filePath, os.O_CREATE|os.O_RDWR, 0o755); err != nil {
return false, fmt.Errorf("create config file %v failed: %v", filePath, err)
}
defer func() {
_ = file.Close()
if err != nil {
_ = os.Remove(filePath)
}
}()
if _, err = file.Write(buf); err != nil {
return false, fmt.Errorf("write config file %v failed: %v", filePath, err)
}
if err = file.Sync(); err != nil {
return false, fmt.Errorf("sync config file %v failed: %v", filePath, err)
}
return true, nil
}
// Load and check stored const configuration
storedConstCfg := new(ConstConfig)
if err = json.Unmarshal(buf, storedConstCfg); err != nil {
return false, fmt.Errorf("unmarshal const config %v failed: %v", filePath, err)
}
if ok := storedConstCfg.Equals(cfg); !ok {
return false, fmt.Errorf("compare const config %v and %v failed: %v", storedConstCfg, cfg, err)
}
return true, nil
}
func CheckOrStoreClusterUuid(dirPath, id string, force bool) (err error) {
dir, err := os.ReadDir(dirPath)
if err != nil {
return fmt.Errorf("read dir %v failed: %v", dirPath, err.Error())
}
versionFile := path.Join(dirPath, ClusterVersionFile)
if len(dir) == 0 || force {
// store clusterUUID
ClusterMap := map[string]interface{}{"ClusterUUID": id}
data, err := json.Marshal(ClusterMap)
if err != nil {
return fmt.Errorf("json marshal failed: %v", err.Error())
}
if err = os.WriteFile(versionFile, data, 0o755); err != nil {
return fmt.Errorf("write file %v failed: %v", versionFile, err.Error())
}
} else {
// check clusterUUID
cfg, err := LoadConfigFile(versionFile)
if err != nil {
return fmt.Errorf("read file %v failed: %v\n", versionFile, err.Error())
}
clusterUuId := cfg.GetString(ClusterUUID)
if clusterUuId != id {
return fmt.Errorf("file %v ClusterUUID %v not equal to %v\n",
versionFile, clusterUuId, id)
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"net"
"sync"
"time"
)
type Object struct {
conn *net.TCPConn
idle int64
}
const (
ConnectIdleTime = 30
defaultConnectTimeout = 1
)
type ConnectPool struct {
sync.RWMutex
pools map[string]*Pool
mincap int
maxcap int
timeout int64
connectTimeout int64
closeCh chan struct{}
closeOnce sync.Once
}
func NewConnectPool() (cp *ConnectPool) {
cp = &ConnectPool{
pools: make(map[string]*Pool),
mincap: 5,
maxcap: 500,
timeout: int64(time.Second * ConnectIdleTime),
connectTimeout: defaultConnectTimeout,
closeCh: make(chan struct{}),
}
go cp.autoRelease()
return cp
}
func NewConnectPoolWithTimeout(idleConnTimeout time.Duration, connectTimeout int64) (cp *ConnectPool) {
cp = &ConnectPool{
pools: make(map[string]*Pool),
mincap: 5,
maxcap: 80,
timeout: int64(idleConnTimeout * time.Second),
connectTimeout: connectTimeout,
closeCh: make(chan struct{}),
}
go cp.autoRelease()
return cp
}
func DailTimeOut(target string, timeout time.Duration) (c *net.TCPConn, err error) {
var connect net.Conn
connect, err = net.DialTimeout("tcp", target, timeout)
if err == nil {
conn := connect.(*net.TCPConn)
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
c = conn
}
return
}
func (cp *ConnectPool) GetConnect(targetAddr string) (c *net.TCPConn, err error) {
cp.RLock()
pool, ok := cp.pools[targetAddr]
cp.RUnlock()
if !ok {
newPool := NewPool(cp.mincap, cp.maxcap, cp.timeout, cp.connectTimeout, targetAddr)
cp.Lock()
pool, ok = cp.pools[targetAddr]
if !ok {
// pool = NewPool(cp.mincap, cp.maxcap, cp.timeout, cp.connectTimeout, targetAddr)
pool = newPool
cp.pools[targetAddr] = pool
}
cp.Unlock()
}
return pool.GetConnectFromPool()
}
func (cp *ConnectPool) PutConnect(c *net.TCPConn, forceClose bool) {
if c == nil {
return
}
if forceClose {
_ = c.Close()
return
}
select {
case <-cp.closeCh:
_ = c.Close()
return
default:
}
addr := c.RemoteAddr().String()
cp.RLock()
pool, ok := cp.pools[addr]
cp.RUnlock()
if !ok {
c.Close()
return
}
object := &Object{conn: c, idle: time.Now().UnixNano()}
pool.PutConnectObjectToPool(object)
}
func (cp *ConnectPool) autoRelease() {
timer := time.NewTimer(time.Second)
for {
select {
case <-cp.closeCh:
timer.Stop()
return
case <-timer.C:
}
pools := make([]*Pool, 0)
cp.RLock()
for _, pool := range cp.pools {
pools = append(pools, pool)
}
cp.RUnlock()
for _, pool := range pools {
pool.autoRelease()
}
timer.Reset(time.Second)
}
}
func (cp *ConnectPool) releaseAll() {
pools := make([]*Pool, 0)
cp.RLock()
for _, pool := range cp.pools {
pools = append(pools, pool)
}
cp.RUnlock()
for _, pool := range pools {
pool.ReleaseAll()
}
}
func (cp *ConnectPool) Close() {
cp.closeOnce.Do(func() {
close(cp.closeCh)
cp.releaseAll()
})
}
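// Illustrative usage sketch (added for clarity; not part of the original
// source): connections are checked out per target address and handed back
// with PutConnect; forceClose=true drops a connection that hit an error.
// The address and payload are hypothetical.
//
//	cp := NewConnectPool()
//	conn, err := cp.GetConnect("192.168.0.11:17030")
//	if err == nil {
//		_, writeErr := conn.Write(payload)
//		cp.PutConnect(conn, writeErr != nil)
//	}
//	cp.Close()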
type Pool struct {
objects chan *Object
mincap int
maxcap int
target string
timeout int64
connectTimeout int64
}
func NewPool(min, max int, timeout, connectTimeout int64, target string) (p *Pool) {
p = new(Pool)
p.mincap = min
p.maxcap = max
p.target = target
p.objects = make(chan *Object, max)
p.timeout = timeout
p.connectTimeout = connectTimeout
p.initAllConnect()
return p
}
func (p *Pool) initAllConnect() {
for i := 0; i < p.mincap; i++ {
c, err := net.Dial("tcp", p.target)
if err == nil {
conn := c.(*net.TCPConn)
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
o := &Object{conn: conn, idle: time.Now().UnixNano()}
p.PutConnectObjectToPool(o)
}
}
}
func (p *Pool) PutConnectObjectToPool(o *Object) {
select {
case p.objects <- o:
return
default:
if o.conn != nil {
o.conn.Close()
}
return
}
}
func (p *Pool) autoRelease() {
connectLen := len(p.objects)
for i := 0; i < connectLen; i++ {
select {
case o := <-p.objects:
if time.Now().UnixNano()-int64(o.idle) > p.timeout {
o.conn.Close()
} else {
p.PutConnectObjectToPool(o)
}
default:
return
}
}
}
func (p *Pool) ReleaseAll() {
connectLen := len(p.objects)
for i := 0; i < connectLen; i++ {
select {
case o := <-p.objects:
o.conn.Close()
default:
return
}
}
}
func (p *Pool) NewConnect(target string) (c *net.TCPConn, err error) {
var connect net.Conn
connect, err = net.DialTimeout("tcp", p.target, time.Duration(p.connectTimeout)*time.Second)
if err == nil {
conn := connect.(*net.TCPConn)
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
c = conn
}
return
}
func (p *Pool) GetConnectFromPool() (c *net.TCPConn, err error) {
var o *Object
for {
select {
case o = <-p.objects:
default:
return p.NewConnect(p.target)
}
if time.Now().UnixNano()-int64(o.idle) > p.timeout {
_ = o.conn.Close()
o = nil
continue
}
return o.conn, nil
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package cryptoutil
import (
"bytes"
"crypto/aes"
"crypto/cipher"
"crypto/hmac"
"crypto/md5"
"crypto/rand"
"crypto/sha256"
"crypto/tls"
"crypto/x509"
"encoding/base64"
"encoding/binary"
"fmt"
"io"
rand2 "math/rand"
"net/http"
"strconv"
"time"
"unsafe"
)
func pad(src []byte) []byte {
padding := aes.BlockSize - len(src)%aes.BlockSize
padtext := bytes.Repeat([]byte{byte(padding)}, padding)
return append(src, padtext...)
}
func unpad(src []byte) []byte {
length := len(src)
unpadding := int(src[length-1])
return src[:(length - unpadding)]
}
// AesEncryptCBC defines aes encryption with CBC
func AesEncryptCBC(key, plaintext []byte) (ciphertext []byte, err error) {
var block cipher.Block
if len(plaintext) == 0 {
err = fmt.Errorf("input for encryption is invalid")
return
}
paddedText := pad(plaintext)
if len(paddedText)%aes.BlockSize != 0 {
err = fmt.Errorf("paddedText [len=%d] is not a multiple of the block size", len(paddedText))
return
}
block, err = aes.NewCipher(key)
if err != nil {
return
}
ciphertext = make([]byte, aes.BlockSize+len(paddedText))
iv := ciphertext[:aes.BlockSize]
if _, err = io.ReadFull(rand.Reader, iv); err != nil {
return
}
cbc := cipher.NewCBCEncrypter(block, iv)
cbc.CryptBlocks(ciphertext[aes.BlockSize:], paddedText)
return
}
// AesDecryptCBC defines aes decryption with CBC
func AesDecryptCBC(key, ciphertext []byte) (plaintext []byte, err error) {
var block cipher.Block
if block, err = aes.NewCipher(key); err != nil {
return
}
if len(ciphertext) < aes.BlockSize {
err = fmt.Errorf("ciphertext [len=%d] too short; should greater than blocksize", len(ciphertext))
return
}
iv := ciphertext[:aes.BlockSize]
ciphertext = ciphertext[aes.BlockSize:]
cbc := cipher.NewCBCDecrypter(block, iv)
cbc.CryptBlocks(ciphertext, ciphertext)
plaintext = unpad(ciphertext)
return
}
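// Illustrative sketch (added for clarity; not part of the original source):
// a CBC round trip with a 256-bit key. AesEncryptCBC prepends the random IV
// to the ciphertext and AesDecryptCBC consumes it again. The all-zero key is
// for illustration only.
//
//	key := make([]byte, 32)
//	ciphertext, _ := AesEncryptCBC(key, []byte("secret"))
//	plaintext, _ := AesDecryptCBC(key, ciphertext)
//	// bytes.Equal(plaintext, []byte("secret")) == true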
// GenSecretKey generates a secret key according to the pair {ts, id}
func GenSecretKey(key []byte, ts int64, id string) (secretKey []byte) {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, uint64(ts))
data := append(b, []byte(id)...)
secretKey = genKey(key, data)
return
}
func genKey(key []byte, data []byte) (sessionKey []byte) {
h := hmac.New(sha256.New, []byte(key))
h.Write([]byte(data))
sessionKey = h.Sum(nil)
return
}
// AuthGenSessionKeyTS authnode generates a session key according to its master key and current timestamp
func AuthGenSessionKeyTS(key []byte) (sessionKey []byte) {
data := []byte(strconv.FormatInt(int64(time.Now().Unix()), 10))
sessionKey = genKey(key, data)
return
}
// Base64Encode encodes text using base64.
func Base64Encode(text []byte) (encodedText string) {
encodedText = base64.StdEncoding.EncodeToString(text)
return
}
// Base64Decode decodes text using base64.
func Base64Decode(encodedText string) (text []byte, err error) {
text, err = base64.StdEncoding.DecodeString(encodedText)
return
}
// EncodeMessage encodes a message with AES encryption and an MD5 checksum.
func EncodeMessage(plaintext []byte, key []byte) (message string, err error) {
var cipher []byte
if len(plaintext) > MaxAllocSize {
return "too max packet", fmt.Errorf("too max packet len %v", len(plaintext))
}
// 8 for random number; 16 for md5 hash
buffer := make([]byte, RandomNumberSize+CheckSumSize+len(plaintext))
// add random
random := rand2.Uint64()
binary.LittleEndian.PutUint64(buffer[RandomNumberOffset:], random)
// add request body
copy(buffer[MessageOffset:], plaintext)
// calculate and add checksum
checksum := md5.Sum(buffer)
copy(buffer[CheckSumOffset:], checksum[:])
// encryption with aes CBC with keysize of 256-bit
if cipher, err = AesEncryptCBC(key, buffer); err != nil {
return
}
// base64 encoding
message = base64.StdEncoding.EncodeToString(cipher)
return
}
// DecodeMessage decodes a message and verifies its validity.
func DecodeMessage(message string, key []byte) (plaintext []byte, err error) {
var (
cipher []byte
decodedText []byte
)
if cipher, err = base64.StdEncoding.DecodeString(message); err != nil {
return
}
if decodedText, err = AesDecryptCBC(key, cipher); err != nil {
return
}
if len(decodedText) <= MessageMetaDataSize {
err = fmt.Errorf("invalid json format with size [%d] less than message meta data size", len(decodedText))
return
}
msgChecksum := make([]byte, CheckSumSize)
copy(msgChecksum, decodedText[CheckSumOffset:CheckSumOffset+CheckSumSize])
// calculate checksum
filltext := bytes.Repeat([]byte{byte(0)}, CheckSumSize)
copy(decodedText[CheckSumOffset:], filltext[:])
newChecksum := md5.Sum(decodedText)
// verify checksum
if !bytes.Equal(msgChecksum, newChecksum[:]) {
err = fmt.Errorf("checksum not match")
}
plaintext = decodedText[MessageOffset:]
// fmt.Printf("DecodeMessage CBC: %s\n", plaintext)
return
}
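// Illustrative sketch (added for clarity; not part of the original source):
// EncodeMessage and DecodeMessage form a round trip; the random prefix and
// MD5 checksum are stripped on decode, leaving only the original payload.
// The all-zero key is for illustration only.
//
//	key := make([]byte, 32)
//	msg, _ := EncodeMessage([]byte(`{"ts":1}`), key)
//	payload, err := DecodeMessage(msg, key)
//	// err == nil and payload holds the original `{"ts":1}` bytes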
// GenVerifier generates a verifier for replay mitigation in HTTP requests.
func GenVerifier(key []byte) (v string, ts int64, err error) {
ts = time.Now().Unix()
tsbuf := make([]byte, unsafe.Sizeof(ts))
binary.LittleEndian.PutUint64(tsbuf, uint64(ts))
if v, err = EncodeMessage(tsbuf, key); err != nil {
panic(err)
}
return
}
// CreateClientX creates an HTTPS client.
func CreateClientX(cert *[]byte) (client *http.Client, err error) {
caCertPool := x509.NewCertPool()
ok := caCertPool.AppendCertsFromPEM(*cert)
if !ok {
err = fmt.Errorf("CreateClientX AppendCertsFromPEM fails")
return
}
// We don't use PKI to verify client since we have secret key for authentication
client = &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
MinVersion: tls.VersionTLS12,
RootCAs: caCertPool,
InsecureSkipVerify: false,
},
},
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"fmt"
"path"
"runtime"
"strings"
)
type ErrorTrace struct {
msg string
}
func New(msg string) error {
return &ErrorTrace{msg: msg}
}
func NewError(err error) error {
if err == nil {
return nil
}
_, file, line, _ := runtime.Caller(1)
_, fileName := path.Split(file)
return &ErrorTrace{
msg: fmt.Sprintf("[%v %v] %v", fileName, line, err.Error()),
}
}
func NewErrorf(format string, a ...interface{}) error {
msg := fmt.Sprintf(format, a...)
_, file, line, _ := runtime.Caller(1)
_, fileName := path.Split(file)
return &ErrorTrace{
msg: fmt.Sprintf("[%v %v] %v", fileName, line, msg),
}
}
func (e *ErrorTrace) Error() string {
return e.msg
}
func Trace(err error, format string, a ...interface{}) error {
msg := fmt.Sprintf(format, a...)
_, file, line, _ := runtime.Caller(1)
_, fileName := path.Split(file)
if err == nil {
return &ErrorTrace{
msg: fmt.Sprintf("[%v %v] %v", fileName, line, msg),
}
}
return &ErrorTrace{
msg: fmt.Sprintf("[%v %v] %v :: %v", fileName, line, msg, err),
}
}
func Stack(err error) string {
e, ok := err.(*ErrorTrace)
if !ok {
return err.Error()
}
var msg string
stack := strings.Split(e.msg, "::")
for _, s := range stack {
msg = fmt.Sprintf("%v\n%v", msg, strings.TrimPrefix(s, " "))
}
return msg
}
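// Illustrative sketch (added for clarity; not part of the original source):
// Trace prepends a "[file line] message" layer joined by "::", and Stack
// expands the chain into one line per layer.
//
//	err := New("disk offline")
//	err = Trace(err, "load partition %v", 7)
//	err = Trace(err, "start datanode")
//	fmt.Println(Stack(err))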
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"runtime"
"sync"
_ "unsafe"
"github.com/brahma-adshonor/gohook"
)
var ErrUnsupportedArch = New("Unsupported arch")
//go:linkname gopanic runtime.gopanic
func gopanic(e interface{})
var panicHook func()
// NOTE: the trampoline approach does not work here
var mu sync.Mutex
func hookedPanic(e interface{}) {
mu.Lock()
defer mu.Unlock()
// NOTE: unhook before invoking the hook function
gohook.UnHook(gopanic)
defer gohook.Hook(gopanic, hookedPanic, nil)
panicHook()
gopanic(e)
}
func AtPanic(hook func()) error {
if !SupportPanicHook() {
return ErrUnsupportedArch
}
panicHook = hook
return gohook.Hook(gopanic, hookedPanic, nil)
}
var (
oldToken = false
newToken = false
)
//go:noinline
func setOldToken() {
oldToken = true
}
//go:noinline
func setNewToken() {
newToken = true
}
func supportTest() (ok bool) {
err := gohook.Hook(setOldToken, setNewToken, nil)
if err != nil {
return
}
setOldToken()
err = gohook.UnHook(setOldToken)
if err != nil {
return
}
setOldToken()
ok = oldToken && newToken
return
}
func SupportPanicHook() (ok bool) {
switch runtime.GOARCH {
case "amd64", "386":
ok = supportTest()
}
return
}
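// Usage sketch (illustrative, not part of the original sources): install a
// best-effort hook, e.g. to flush logs before the process unwinds on panic.
func installPanicHook(flush func()) error {
if !SupportPanicHook() {
return ErrUnsupportedArch
}
return AtPanic(flush)
}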
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"fmt"
"sync"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/ump"
)
var (
AlarmPool = &sync.Pool{New: func() interface{} {
return new(Alarm)
}}
// AlarmGroup sync.Map
AlarmCh chan *Alarm
)
func collectAlarm() {
AlarmCh = make(chan *Alarm, ChSize)
for {
m := <-AlarmCh
AlarmPool.Put(m)
}
}
type Alarm struct {
Counter
}
func Warning(detail string) (a *Alarm) {
key := fmt.Sprintf("%v_%v_warning", clustername, modulename)
ump.Alarm(key, detail)
log.LogCritical(key, detail)
if !enabledPrometheus {
return
}
a = AlarmPool.Get().(*Alarm)
a.name = metricsName(key)
a.Add(1)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"regexp"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
RegisterPeriod = time.Duration(10) * time.Minute
RegisterPath = "/v1/agent/service/register"
)
// ConsulRegisterInfo is the consul registration info exposed for prometheus.
// It is optional and only used when the prometheus exporter is enabled.
type ConsulRegisterInfo struct {
Name string `json:"Name"`
ID string `json:"ID"`
Address string `json:"Address"`
Port int64 `json:"Port"`
Tags []string `json:"Tags"`
Meta map[string]string `json:",omitempty"`
}
// GetConsulId builds the consul service id from app, role, host and port
func GetConsulId(app string, role string, host string, port int64) string {
return fmt.Sprintf("%s_%s_%s_%d", app, role, host, port)
}
// DoConsulRegisterProc registers the service with consul and re-registers it periodically
func DoConsulRegisterProc(addr, app, role, cluster, meta, host string, port int64) {
if len(addr) <= 0 {
return
}
log.LogInfof("metrics consul register %v %v %v", addr, cluster, port)
ticker := time.NewTicker(RegisterPeriod)
defer func() {
if err := recover(); err != nil {
log.LogErrorf("RegisterConsul panic,err[%v]", err)
}
ticker.Stop()
}()
client := &http.Client{}
req := makeRegisterReq(host, addr, app, role, cluster, meta, port)
if req == nil {
log.LogErrorf("make register req error")
return
}
if resp, _ := client.Do(req); resp != nil {
io.ReadAll(resp.Body)
resp.Body.Close()
}
for range ticker.C {
req := makeRegisterReq(host, addr, app, role, cluster, meta, port)
if req == nil {
log.LogErrorf("make register req error")
return
}
if resp, _ := client.Do(req); resp != nil {
io.ReadAll(resp.Body)
resp.Body.Close()
}
}
}
// GetLocalIpAddr returns the local IP address.
func GetLocalIpAddr(filter string) (ipaddr string, err error) {
addrs, err := net.InterfaceAddrs()
if err != nil {
log.LogError("consul register get local ip failed, ", err)
return
}
for _, addr := range addrs {
if ipnet, ok := addr.(*net.IPNet); ok && !ipnet.IP.IsLoopback() {
if ipnet.IP.To4() != nil {
ip := ipnet.IP.String()
if filter != "" {
match, err := doFilter(filter, ip)
if err != nil {
return "", fmt.Errorf("regex match err, err %s", err.Error())
}
if !match {
continue
}
}
return ip, nil
}
}
}
return "", fmt.Errorf("cannot get local ip")
}
// a leading '!' in the filter marks it as a negative (exclusion) match
func doFilter(filter, ip string) (ok bool, err error) {
// negative filter
if strings.HasPrefix(filter, "!") {
filter = filter[1:]
ok, err := regexp.MatchString(filter, ip)
return !ok, err
}
ok, err = regexp.MatchString(filter, ip)
return ok, err
}
// makeRegisterReq builds the consul registration request
func makeRegisterReq(host, addr, app, role, cluster, meta string, port int64) (req *http.Request) {
id := GetConsulId(app, role, host, port)
url := addr + RegisterPath
cInfo := &ConsulRegisterInfo{
Name: app,
ID: id,
Address: host,
Port: port,
Tags: []string{
"app=" + app,
"role=" + role,
"cluster=" + cluster,
},
}
ok, metas := parseMetaStr(meta)
if ok {
cInfo.Meta = metas
cInfo.Meta["cluster"] = cluster
cInfo.Meta["commit"] = proto.CommitID
if len(cInfo.Meta["metric_path"]) == 0 {
cInfo.Meta["metric_path"] = "/metrics"
log.LogInfo("metric_path is empty, use default /metrics")
}
}
cInfoBytes, err := json.Marshal(cInfo)
if err != nil {
log.LogErrorf("marshal error, %v", err.Error())
return nil
}
req, err = http.NewRequest(http.MethodPut, url, bytes.NewBuffer(cInfoBytes))
if err != nil {
log.LogErrorf("new request error, %v", err.Error())
return nil
}
req.Header.Set("Content-Type", "application/json; charset=utf-8")
req.Close = true
return
}
// parse k1=v1;k2=v2 as a map
func parseMetaStr(meta string) (bool, map[string]string) {
if len(meta) == 0 {
log.LogInfo("meta is empty, use default")
meta = "dataset=custom;category=custom;app=cfs;role=fuseclient;metric_path=/metrics"
}
m := map[string]string{}
kvs := strings.Split(meta, ";")
for _, kv := range kvs {
arr := strings.Split(kv, "=")
if len(arr) != 2 {
log.LogInfof("meta is invalid, can't use %s", meta)
return false, m
}
m[arr[0]] = arr[1]
}
return true, m
}
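// Usage sketch (illustrative values only, not part of the original sources):
// how the registration helpers compose into a ConsulRegisterInfo.
func exampleRegisterInfo() *ConsulRegisterInfo {
id := GetConsulId("cfs", "master", "192.168.0.10", 17010)
info := &ConsulRegisterInfo{Name: "cfs", ID: id, Address: "192.168.0.10", Port: 17010}
if ok, meta := parseMetaStr("dataset=custom;metric_path=/metrics"); ok {
info.Meta = meta
}
return info
}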
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"sync"
"github.com/cubefs/cubefs/util/log"
"github.com/prometheus/client_golang/prometheus"
)
var (
CounterGroup sync.Map
CounterPool = &sync.Pool{New: func() interface{} {
return new(Counter)
}}
CounterCh chan *Counter
)
func collectCounter() {
CounterCh = make(chan *Counter, ChSize)
for {
m := <-CounterCh
metric := m.Metric()
metric.Add(float64(m.val))
}
}
type Counter struct {
Gauge
}
func NewCounter(name string) (c *Counter) {
c = new(Counter)
c.name = metricsName(name)
return
}
func (c *Counter) Add(val int64) {
if !enabledPrometheus {
return
}
c.val = float64(val)
c.publish()
}
func (c *Counter) publish() {
select {
case CounterCh <- c:
default:
}
}
func (c *Counter) AddWithLabels(val int64, labels map[string]string) {
if !enabledPrometheus {
return
}
c.labels = labels
c.Add(val)
}
func (c *Counter) Metric() prometheus.Counter {
metric := prometheus.NewCounter(
prometheus.CounterOpts{
Name: c.name,
ConstLabels: c.labels,
})
key := c.Key()
actualMetric, load := CounterGroup.LoadOrStore(key, metric)
if load {
return actualMetric.(prometheus.Counter)
}
if enablePush {
registry.MustRegister(actualMetric.(prometheus.Collector))
return actualMetric.(prometheus.Counter)
}
err := prometheus.Register(actualMetric.(prometheus.Collector))
if err == nil {
log.LogInfo("register metric ", c.name)
}
return actualMetric.(prometheus.Counter)
}
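// Usage sketch (not part of the original sources): count an operation with a
// volume label. The metric name "op_count" and the label value are examples.
func exampleCountOp(vol string) {
c := NewCounter("op_count")
c.AddWithLabels(1, map[string]string{Vol: vol})
}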
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"fmt"
"net"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/log"
"github.com/gorilla/mux"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/client_golang/prometheus/push"
)
const (
PromHandlerPattern = "/metrics" // prometheus handler
AppName = "cfs" // app name
ConfigKeyExporterEnable = "exporterEnable" // exporter enable
ConfigKeyExporterPort = "exporterPort" // exporter port
ConfigKeyConsulAddr = "consulAddr" // consul addr
ConfigKeyConsulMeta = "consulMeta" // consul meta
ConfigKeyIpFilter = "ipFilter" // add ip filter
ConfigKeyEnablePid = "enablePid" // enable report partition id
ConfigKeyPushAddr = "pushAddr" // enable push data to gateway
ChSize = 1024 * 10 // collect chan size
// monitor label name
Vol = "vol"
Disk = "disk"
PartId = "partid"
Op = "op"
Type = "type"
Err = "err"
)
var (
namespace string
clustername string
modulename string
pushAddr string
exporterPort int64
enabledPrometheus = false
enablePush = false
EnablePid = false
replacer = strings.NewReplacer("-", "_", ".", "_", " ", "_", ",", "_", ":", "_")
registry = prometheus.NewRegistry()
)
func metricsName(name string) string {
return replacer.Replace(fmt.Sprintf("%s_%s", namespace, name))
}
// Init initializes the exporter.
func Init(role string, cfg *config.Config) {
modulename = role
if !cfg.GetBoolWithDefault(ConfigKeyExporterEnable, true) {
log.LogInfof("%v exporter disabled", role)
return
}
EnablePid = cfg.GetBoolWithDefault(ConfigKeyEnablePid, false)
log.LogInfo("enable report partition id info? ", EnablePid)
port := cfg.GetInt64(ConfigKeyExporterPort)
if port < 0 {
log.LogInfof("%v exporter port set random default", port)
}
exporterPort = port
enabledPrometheus = true
pushAddr = cfg.GetString(ConfigKeyPushAddr)
log.LogInfof("pushAddr %v ", pushAddr)
if pushAddr != "" {
enablePush = true
}
http.Handle(PromHandlerPattern, promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
Timeout: 60 * time.Second,
}))
namespace = AppName + "_" + role
addr := fmt.Sprintf(":%d", port)
l, err := net.Listen("tcp", addr)
if err != nil {
log.LogError("exporter tcp listen error: ", err)
return
}
exporterPort = int64(l.Addr().(*net.TCPAddr).Port)
go func() {
err = http.Serve(l, nil)
if err != nil {
log.LogError("exporter http serve error: ", err)
return
}
}()
collect()
m := NewGauge("start_time")
m.Set(float64(time.Now().Unix() * 1000))
log.LogInfof("exporter Start: %v", exporterPort)
}
// InitWithRouter initializes the exporter and mounts the metrics handler on the given router.
func InitWithRouter(role string, cfg *config.Config, router *mux.Router, exPort string) {
modulename = role
if !cfg.GetBoolWithDefault(ConfigKeyExporterEnable, true) {
log.LogInfof("%v metrics exporter disabled", role)
return
}
exporterPort, _ = strconv.ParseInt(exPort, 10, 64)
enabledPrometheus = true
router.NewRoute().Name("metrics").
Methods(http.MethodGet).
Path(PromHandlerPattern).
Handler(promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
Timeout: 5 * time.Second,
}))
namespace = AppName + "_" + role
collect()
m := NewGauge("start_time")
m.Set(float64(time.Now().Unix() * 1000))
log.LogInfof("exporter Start: %v %v", exporterPort, m)
}
func RegistConsul(cluster string, role string, cfg *config.Config) {
ipFilter := cfg.GetString(ConfigKeyIpFilter)
host, err := GetLocalIpAddr(ipFilter)
if err != nil {
log.LogErrorf("get local ip error, %v", err.Error())
return
}
rawmnt := cfg.GetString("subdir")
if rawmnt == "" {
rawmnt = "/"
}
mountPoint, _ := filepath.Abs(rawmnt)
log.LogInfof("RegistConsul:%v", enablePush)
if enablePush {
log.LogWarnf("[RegisterConsul] use auto push data strategy, not register consul")
autoPush(pushAddr, role, cluster, host, mountPoint)
return
}
clustername = replacer.Replace(cluster)
consulAddr := cfg.GetString(ConfigKeyConsulAddr)
consulMeta := cfg.GetString(ConfigKeyConsulMeta)
if exporterPort == int64(0) {
exporterPort = cfg.GetInt64(ConfigKeyExporterPort)
}
if exporterPort == 0 {
log.LogInfo("config export port is 0, use default 17510")
exporterPort = 17510
}
if exporterPort != int64(0) && len(consulAddr) > 0 {
if ok := strings.HasPrefix(consulAddr, "http"); !ok {
consulAddr = "http://" + consulAddr
}
go DoConsulRegisterProc(consulAddr, AppName, role, cluster, consulMeta, host, exporterPort)
}
}
func autoPush(pushAddr, role, cluster, ip, mountPoint string) {
pid := os.Getpid()
client := &http.Client{
Timeout: time.Second * 10,
}
hostname, err := os.Hostname()
if err != nil {
log.LogWarnf("get host name failed %v", err)
}
pusher := push.New(pushAddr, "cbfs").
Client(client).
Gatherer(registry).
Grouping("cip", ip).
Grouping("role", role).
Grouping("cluster", cluster).
Grouping("pid", strconv.Itoa(pid)).
Grouping("commit", proto.CommitID).
Grouping("app", AppName).
Grouping("mountPoint", mountPoint).
Grouping("hostName", hostname)
log.LogInfof("start push data, ip %s, addr %s, role %s, cluster %s, mountPoint %s, hostName %s",
ip, pushAddr, role, cluster, mountPoint, hostname)
ticker := time.NewTicker(time.Second * 15)
go func() {
for range ticker.C {
if err := pusher.Push(); err != nil {
log.LogWarnf("push monitor data to %s err, %s", pushAddr, err.Error())
}
}
}()
}
func collect() {
if !enabledPrometheus {
return
}
go collectCounter()
go collectGauge()
go collectHistogram()
go collectAlarm()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"fmt"
"sync"
"github.com/cubefs/cubefs/util/log"
"github.com/prometheus/client_golang/prometheus"
)
var (
GaugeGroup sync.Map
GaugeCh chan *Gauge
)
func collectGauge() {
GaugeCh = make(chan *Gauge, ChSize)
for {
m := <-GaugeCh
metric := m.Metric()
metric.Set(m.val)
// log.LogDebugf("collect metric %v", m)
}
}
type Gauge struct {
name string
labels map[string]string
val float64
}
func NewGauge(name string) (g *Gauge) {
g = new(Gauge)
g.name = metricsName(name)
return
}
func (c *Gauge) Key() (key string) {
return stringMD5(c.Name())
}
func (g *Gauge) Name() string {
return fmt.Sprintf("{%s: %s}", g.name, stringMapToString(g.labels))
}
func (g *Gauge) String() string {
return fmt.Sprintf("{name: %s, labels: %s, val: %v}", g.name, stringMapToString(g.labels), g.val)
}
func (c *Gauge) Metric() prometheus.Gauge {
metric := prometheus.NewGauge(
prometheus.GaugeOpts{
Name: c.name,
ConstLabels: c.labels,
})
key := c.Key()
actualMetric, load := GaugeGroup.LoadOrStore(key, metric)
if load {
return actualMetric.(prometheus.Gauge)
}
if enablePush {
registry.MustRegister(actualMetric.(prometheus.Collector))
return actualMetric.(prometheus.Gauge)
}
err := prometheus.Register(actualMetric.(prometheus.Collector))
if err == nil {
log.LogInfof("register metric %v", c.Name())
} else {
log.LogErrorf("register metric %v, %v", c.Name(), err)
}
return actualMetric.(prometheus.Gauge)
}
func (g *Gauge) Set(val float64) {
if !enabledPrometheus {
return
}
g.val = val
g.publish()
}
func (c *Gauge) publish() {
select {
case GaugeCh <- c:
default:
}
}
func (g *Gauge) SetWithLabels(val float64, labels map[string]string) {
if !enabledPrometheus {
return
}
g.labels = labels
g.Set(val)
}
type GaugeVec struct {
*prometheus.GaugeVec
}
func NewGaugeVec(name, help string, labels []string) *GaugeVec {
if !enabledPrometheus {
return nil
}
v := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: metricsName(name),
Help: help,
},
labels,
)
if err := prometheus.Register(v); err != nil {
log.LogErrorf("prometheus register gaugevec name:%v, labels:{%v} error: %v", name, labels, err)
return nil
}
return &GaugeVec{GaugeVec: v}
}
func (v *GaugeVec) SetWithLabelValues(val float64, lvs ...string) {
if m, err := v.GetMetricWithLabelValues(lvs...); err == nil {
m.Set(val)
}
}
func (v *GaugeVec) SetBoolWithLabelValues(val bool, lvs ...string) {
if val {
v.SetWithLabelValues(float64(1), lvs...)
} else {
v.SetWithLabelValues(0, lvs...)
}
}
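// Usage sketch (not part of the original sources): a labeled gauge held in a
// long-lived variable; the metric name and help text are illustrative.
var exampleDiskUsageVec = NewGaugeVec("disk_usage_ratio", "disk used ratio", []string{Disk})

func exampleReportDiskUsage(diskName string, ratio float64) {
// nil when prometheus is disabled or registration failed
if exampleDiskUsageVec != nil {
exampleDiskUsageVec.SetWithLabelValues(ratio, diskName)
}
}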
package exporter
import (
"fmt"
"sync"
"github.com/cubefs/cubefs/util/log"
"github.com/prometheus/client_golang/prometheus"
)
var (
// bucket upper bounds in microseconds: 1us, 50us, 250us, 500us, 2.5ms, 5ms, 25ms, 50ms, 250ms, 500ms, 2.5s, 5s
buckets = []float64{1, 50, 250, 500, 2500, 5000, 25000, 50000, 250000, 500000, 2500000, 5000000}
HistogramGroup sync.Map
HistogramCh chan *Histogram
once = sync.Once{}
)
func collectHistogram() {
HistogramCh = make(chan *Histogram, ChSize)
for {
m := <-HistogramCh
metric := m.Metric()
metric.Observe(m.val / 1000)
}
}
type Histogram struct {
name string
labels map[string]string
val float64
}
func (c *Histogram) Key() (key string) {
return stringMD5(c.Name())
}
func (g *Histogram) Name() string {
return fmt.Sprintf("{%s: %s}", g.name, stringMapToString(g.labels))
}
func (g *Histogram) String() string {
return fmt.Sprintf("{name: %s, labels: %s, val: %v}", g.name, stringMapToString(g.labels), g.val)
}
func (c *Histogram) Metric() prometheus.Histogram {
if enablePush {
once.Do(func() {
buckets = []float64{1, 300, 1000, 5000, 500000, 2500000}
})
}
metric := prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: c.name,
ConstLabels: c.labels,
Buckets: buckets,
})
key := c.Key()
actualMetric, load := HistogramGroup.LoadOrStore(key, metric)
if load {
return actualMetric.(prometheus.Histogram)
}
if enablePush {
registry.MustRegister(actualMetric.(prometheus.Collector))
return actualMetric.(prometheus.Histogram)
}
err := prometheus.Register(actualMetric.(prometheus.Collector))
if err == nil {
log.LogInfof("register metric %v", c.Name())
} else {
log.LogErrorf("register metric %v, %v", c.Name(), err)
}
return actualMetric.(prometheus.Histogram)
}
func (h *Histogram) publish() {
select {
case HistogramCh <- h:
default:
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"fmt"
"time"
"github.com/cubefs/cubefs/util/ump"
)
type TimePoint struct {
Histogram
startTime time.Time
}
func NewTP(name string) (tp *TimePoint) {
tp = new(TimePoint)
tp.name = fmt.Sprintf("%s_hist", metricsName(name))
tp.labels = make(map[string]string)
tp.val = 0
tp.startTime = time.Now()
return
}
func (tp *TimePoint) Set() {
if !enabledPrometheus {
return
}
val := time.Since(tp.startTime).Nanoseconds()
tp.val = float64(val)
tp.publish()
}
func (tp *TimePoint) SetWithLabels(labels map[string]string) {
if !enabledPrometheus {
return
}
tp.labels = labels
tp.Set()
}
func (tp *TimePoint) GetStartTime() time.Time {
return tp.startTime
}
type TimePointCount struct {
tp *TimePoint
cnt *Counter
to *ump.TpObject
}
func NewTPCnt(name string) (tpc *TimePointCount) {
tpc = new(TimePointCount)
tpc.to = ump.BeforeTP(fmt.Sprintf("%v_%v_%v", clustername, modulename, name))
tpc.tp = NewTP(name)
tpc.cnt = NewCounter(fmt.Sprintf("%s_count", name))
return
}
// Set should be invoked via defer, e.g. defer func() { tpc.Set(err) }()
func (tpc *TimePointCount) Set(err error) {
ump.AfterTP(tpc.to, err)
tpc.tp.Set()
tpc.cnt.Add(1)
}
func (tpc *TimePointCount) SetWithLabels(err error, labels map[string]string) {
ump.AfterTP(tpc.to, err)
if !enabledPrometheus {
return
}
tpc.tp.SetWithLabels(labels)
tpc.cnt.AddWithLabels(1, labels)
}
func (tpc *TimePointCount) GetStartTime() time.Time {
return tpc.tp.GetStartTime()
}
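// Usage sketch (not part of the original sources): the intended defer pattern,
// recording both latency and an error-aware ump alarm for an operation.
// The metric name "example_op" is illustrative.
func exampleTimedOperation(do func() error) (err error) {
tpc := NewTPCnt("example_op")
defer func() { tpc.Set(err) }()
err = do()
return
}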
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"crypto/md5"
"encoding/json"
"fmt"
"io"
)
func stringMD5(str string) string {
h := md5.New()
_, err := io.WriteString(h, str)
if err != nil {
return ""
}
return fmt.Sprintf("%x", h.Sum(nil))
}
func stringMapToString(m map[string]string) string {
mjson, err := json.Marshal(m)
if err != nil {
return "{}"
}
return string(mjson)
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fileutil
import "os"
func Exist(path string) bool {
_, err := os.Stat(path)
return err == nil || !os.IsNotExist(err)
}
func ExistDir(path string) bool {
state, err := os.Stat(path)
if err != nil {
// avoid dereferencing a nil FileInfo when Stat fails
return false
}
return state.IsDir()
}
// Copyright 2024 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fileutil
import (
"os"
"syscall"
)
func Stat(name string) (stat *syscall.Stat_t, err error) {
info, err := os.Stat(name)
if err != nil {
return
}
stat = info.Sys().(*syscall.Stat_t)
return
}
// Copyright 2024 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fileutil
import "syscall"
type FilesystemInfo = syscall.Statfs_t
func Statfs(name string) (stat *FilesystemInfo, err error) {
stat = &FilesystemInfo{}
err = syscall.Statfs(name, stat)
return
}
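// Usage sketch (not part of the original sources): derive the available byte
// count of a filesystem from the raw statfs counters.
func exampleAvailableBytes(path string) (uint64, error) {
stat, err := Statfs(path)
if err != nil {
return 0, err
}
return stat.Bavail * uint64(stat.Bsize), nil
}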
package util
type respErr struct {
errCh chan error
}
func (e *respErr) init() {
e.errCh = make(chan error, 1)
}
func (e *respErr) respond(err error) {
e.errCh <- err
close(e.errCh)
}
func (e *respErr) error() <-chan error {
return e.errCh
}
// Future the future
type Future struct {
respErr
respCh chan interface{}
}
func NewFuture() *Future {
f := &Future{
respCh: make(chan interface{}, 1),
}
f.init()
return f
}
func (f *Future) Respond(resp interface{}, err error) {
if err == nil {
f.respCh <- resp
close(f.respCh)
} else {
f.respErr.respond(err)
}
}
// Response wait response
func (f *Future) Response() (resp interface{}, err error) {
select {
case err = <-f.error():
return
case resp = <-f.respCh:
return
}
}
// AsyncResponse export channels
func (f *Future) AsyncResponse() (respCh <-chan interface{}, errCh <-chan error) {
return f.respCh, f.errCh
}
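// Usage sketch (not part of the original sources): a producer responds once
// with either a value or an error, while the consumer blocks on Response.
func exampleFuture(do func() (interface{}, error)) (interface{}, error) {
f := NewFuture()
go func() {
f.Respond(do())
}()
return f.Response()
}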
// Copyright 2015 The Go Authors. All rights reserved.
//
// Modified by 2020 The CubeFS Authors.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Minimal RFC 6724 address selection.
package iputil
import (
"net"
)
// commonPrefixLen reports the length of the longest prefix (looking
// at the most significant, or leftmost, bits) that the
// two addresses have in common, up to the length of a's prefix (i.e.,
// the portion of the address not including the interface ID).
//
// If a or b is an IPv4 address represented as an IPv6 address, the IPv4
// addresses are compared (with a maximum common prefix length of 32).
// If a and b are different IP versions, 0 is returned.
//
// See https://tools.ietf.org/html/rfc6724#section-2.2
func commonPrefixLen(a, b net.IP) (cpl int) {
if a4 := a.To4(); a4 != nil {
a = a4
}
if b4 := b.To4(); b4 != nil {
b = b4
}
if len(a) != len(b) {
return 0
}
// If IPv6, only up to the prefix (first 64 bits)
if len(a) > 8 {
a = a[:8]
b = b[:8]
}
for len(a) > 0 {
if a[0] == b[0] {
cpl += 8
a = a[1:]
b = b[1:]
continue
}
bits := 8
ab, bb := a[0], b[0]
for {
ab >>= 1
bb >>= 1
bits--
if ab == bb {
cpl += bits
return
}
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package iputil
import (
"errors"
"net"
"net/http"
"strings"
)
var cidrs []*net.IPNet
func init() {
maxCidrBlocks := []string{
"127.0.0.1/8", // localhost
"10.0.0.0/8", // 24-bit block
"172.16.0.0/12", // 20-bit block
"192.168.0.0/16", // 16-bit block
"169.254.0.0/16", // link local address
"::1/128", // localhost IPv6
"fc00::/7", // unique local address IPv6
"fe80::/10", // link local address IPv6
}
cidrs = make([]*net.IPNet, len(maxCidrBlocks))
for i, maxCidrBlock := range maxCidrBlocks {
_, cidr, _ := net.ParseCIDR(maxCidrBlock)
cidrs[i] = cidr
}
}
// isPrivateAddress works by checking if the address is under private CIDR blocks.
// List of private CIDR blocks can be seen on :
//
// https://en.wikipedia.org/wiki/Private_network
//
// https://en.wikipedia.org/wiki/Link-local_address
func isPrivateAddress(address string) (bool, error) {
ipAddress := net.ParseIP(address)
if ipAddress == nil {
return false, errors.New("address is not valid")
}
for i := range cidrs {
if cidrs[i].Contains(ipAddress) {
return true, nil
}
}
return false, nil
}
// FromRequest return client's real public IP address from http request headers.
func FromRequest(r *http.Request) string {
// Fetch header value
xRealIP := r.Header.Get("X-Real-Ip")
xForwardedFor := r.Header.Get("X-Forwarded-For")
// If both empty, return IP from remote address
if xRealIP == "" && xForwardedFor == "" {
var remoteIP string
// If there is a colon in the remote address, strip the port number;
// otherwise, return the remote address as is
if strings.ContainsRune(r.RemoteAddr, ':') {
remoteIP, _, _ = net.SplitHostPort(r.RemoteAddr)
} else {
remoteIP = r.RemoteAddr
}
return remoteIP
}
// Check list of IP in X-Forwarded-For and return the first global address
for _, address := range strings.Split(xForwardedFor, ",") {
address = strings.TrimSpace(address)
isPrivate, err := isPrivateAddress(address)
if !isPrivate && err == nil {
return address
}
}
// If nothing succeeds, fall back to X-Real-IP
return xRealIP
}
// Deprecated: RealIP is kept for compatibility; use FromRequest instead.
func RealIP(r *http.Request) string {
return FromRequest(r)
}
// DEFAULT_MAX_DISTANCE is the maximum distance between two IPs, i.e. the bit length of an IPv6 address
const DEFAULT_MAX_DISTANCE = 128
func GetDistance(a, b net.IP) int {
return DEFAULT_MAX_DISTANCE - commonPrefixLen(a, b)
}
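// Usage sketch (not part of the original sources): prefer the candidate host
// that shares the longer common prefix with the local address.
func exampleCloserHost(local, a, b net.IP) net.IP {
if GetDistance(local, a) <= GetDistance(local, b) {
return a
}
return b
}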
package keystore
import (
"encoding/json"
"fmt"
"regexp"
"github.com/cubefs/cubefs/util/caps"
)
type AccessKeyInfo struct {
AccessKey string `json:"access_key"`
ID string `json:"id"`
}
type AccessKeyCaps struct {
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Caps []byte `json:"caps"`
ID string `json:"user_id"`
}
func (u *AccessKeyCaps) IsValidCaps() (err error) {
cap := new(caps.Caps)
if err = cap.Init(u.Caps); err != nil {
err = fmt.Errorf("Invalid caps [%s] %s", u.Caps, err.Error())
}
return
}
func (u *AccessKeyCaps) IsValidAK() (err error) {
re := regexp.MustCompile("^[A-Za-z0-9]{16}$")
if !re.MatchString(u.AccessKey) {
err = fmt.Errorf("invalid AccessKey [%s]", u.AccessKey)
return
}
return
}
func (u *AccessKeyCaps) DumpJSONStr() (r string, err error) {
dumpInfo := struct {
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Caps string `json:"caps"`
ID string `json:"id"`
}{
u.AccessKey,
u.SecretKey,
string(u.Caps),
u.ID,
}
data, err := json.MarshalIndent(dumpInfo, "", " ")
if err != nil {
return
}
r = string(data)
return
}
package keystore
import (
"encoding/json"
"fmt"
"io"
"os"
"regexp"
"github.com/cubefs/cubefs/util/caps"
)
var roleSet = map[string]bool{
"client": true,
"service": true,
}
// KeyInfo defines the key info structure in key store
type KeyInfo struct {
ID string `json:"id"`
AuthKey []byte `json:"auth_key"`
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Ts int64 `json:"create_ts"`
Role string `json:"role"`
Caps []byte `json:"caps"`
}
// DumpJSONFile dump KeyInfo to file in json format
func (u *KeyInfo) DumpJSONFile(filename string, authIdKey string) (err error) {
var data string
if data, err = u.DumpJSONStr(authIdKey); err != nil {
return
}
file, err := os.Create(filename)
if err != nil {
return
}
defer file.Close()
_, err = io.WriteString(file, data)
if err != nil {
return
}
return
}
// DumpJSONStr dump KeyInfo to string in json format
func (u *KeyInfo) DumpJSONStr(authIdKey string) (r string, err error) {
dumpInfo := struct {
ID string `json:"id"`
AuthKey []byte `json:"auth_key"`
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Ts int64 `json:"create_ts"`
Role string `json:"role"`
Caps string `json:"caps"`
AuthIdKey string `json:"auth_id_key"`
}{
u.ID,
u.AuthKey,
u.AccessKey,
u.SecretKey,
u.Ts,
u.Role,
string(u.Caps),
authIdKey,
}
data, err := json.MarshalIndent(dumpInfo, "", " ")
if err != nil {
return
}
r = string(data)
return
}
// IsValidID check the validity of ID
func (u *KeyInfo) IsValidID() (err error) {
re := regexp.MustCompile("^[A-Za-z]{1,1}[A-Za-z0-9_]{0,20}$")
if !re.MatchString(u.ID) {
err = fmt.Errorf("invalid ID [%s]", u.ID)
return
}
return
}
// IsValidRole check the validity of role
func (u *KeyInfo) IsValidRole() (err error) {
if _, ok := roleSet[u.Role]; !ok {
err = fmt.Errorf("invalid Role [%s]", u.Role)
return
}
return
}
// IsValidCaps check the validity of caps
func (u *KeyInfo) IsValidCaps() (err error) {
cap := new(caps.Caps)
if err = cap.Init(u.Caps); err != nil {
err = fmt.Errorf("Invalid caps [%s] %s", u.Caps, err.Error())
}
return
}
// IsValidKeyInfo is a valid of KeyInfo
func (u *KeyInfo) IsValidKeyInfo() (err error) {
if err = u.IsValidID(); err != nil {
return
}
if err = u.IsValidRole(); err != nil {
return
}
if err = u.IsValidCaps(); err != nil {
return
}
return
}
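// Usage sketch (not part of the original sources): validate a key record
// before persisting it; the file name and authIdKey come from the caller.
func exampleStoreKey(filename string, info *KeyInfo, authIdKey string) error {
if err := info.IsValidKeyInfo(); err != nil {
return err
}
return info.DumpJSONFile(filename, authIdKey)
}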
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package loadutil
import (
"fmt"
"time"
"github.com/shirou/gopsutil/cpu"
"github.com/cubefs/cubefs/util/log"
)
func GetCpuUtilPercent(sampleDuration time.Duration) (used float64, err error) {
utils, err := cpu.Percent(sampleDuration, false)
if err != nil {
log.LogErrorf("[GetCpuUtilPercent] err: %v", err.Error())
return
}
if utils == nil {
err = fmt.Errorf("got nil result")
log.LogErrorf("[GetCpuUtilPercent] err: %v", err.Error())
return
}
if len(utils) == 0 {
err = fmt.Errorf("got result len is 0")
log.LogErrorf("[GetCpuUtilPercent] err: %v", err.Error())
return
}
used = utils[0]
return
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package loadutil
import (
"errors"
"fmt"
"time"
"github.com/shirou/gopsutil/disk"
)
// getMatchCount returns the length of the common prefix of lhs and rhs.
func getMatchCount(lhs string, rhs string) int {
count := len(lhs)
if count > len(rhs) {
count = len(rhs)
}
for i := 0; i < count; i++ {
if lhs[i] != rhs[i] {
// the common prefix ends at the first mismatching byte
return i
}
}
return count
}
// GetMatchParation returns the partition whose mount point is the longest
// prefix of the given path.
func GetMatchParation(path string) (*disk.PartitionStat, error) {
partitions, err := disk.Partitions(true)
if err != nil {
return nil, err
}
maxMatch := 0
matchParation := disk.PartitionStat{}
for _, partition := range partitions {
match := getMatchCount(path, partition.Mountpoint)
if match == len(partition.Mountpoint) && match > maxMatch {
// remember the longest mount-point match seen so far
maxMatch = match
matchParation = partition
}
}
return &matchParation, nil
}
var (
ErrInvalidDiskPartition = errors.New("invalid disk partiton")
ErrFailedToGetIoCounter = errors.New("failed to get io counter")
)
func getDeviceNameFromPartition(partition *disk.PartitionStat) (string, error) {
var name string
if n, err := fmt.Sscanf(partition.Device, "/dev/%s", &name); n != 1 || err != nil {
return "", ErrInvalidDiskPartition
}
return name, nil
}
func GetIoCounter(partition *disk.PartitionStat) (*disk.IOCountersStat, error) {
name, err := getDeviceNameFromPartition(partition)
if err != nil {
return nil, err
}
counters, err := disk.IOCounters(name)
if err != nil {
return nil, err
}
counter, exist := counters[name]
if !exist {
return nil, ErrFailedToGetIoCounter
}
return &counter, nil
}
type DiskIoSampleItem struct {
time time.Time
ioCounter *disk.IOCountersStat
}
func getDiskIoSampleItem(partition *disk.PartitionStat) (*DiskIoSampleItem, error) {
ioCounter, err := GetIoCounter(partition)
if err != nil {
return nil, err
}
// avoid shadowing the time package
now := time.Now()
return &DiskIoSampleItem{
time: now,
ioCounter: ioCounter,
}, nil
}
func getReadFlow(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
ms := uint64(second.time.Sub(first.time).Milliseconds())
if ms == 0 {
// avoid a divide-by-zero for sub-millisecond sampling windows
return 0
}
bytes := second.ioCounter.ReadBytes - first.ioCounter.ReadBytes
return bytes * 1000 / ms
}
func getWriteFlow(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
ms := uint64(second.time.Sub(first.time).Milliseconds())
if ms == 0 {
return 0
}
bytes := second.ioCounter.WriteBytes - first.ioCounter.WriteBytes
return bytes * 1000 / ms
}
func getIoCount(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.ReadCount - first.ioCounter.ReadCount + second.ioCounter.WriteCount - first.ioCounter.WriteCount
return count
}
func getTotalReadWaitTime(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.ReadTime - first.ioCounter.ReadTime
return count
}
func getTotalWriteWaitTime(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.WriteTime - first.ioCounter.WriteTime
return count
}
func getIoTotalWaitTime(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.IoTime - first.ioCounter.IoTime
return count
}
func getIoTotalWeightedWaitTime(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.WeightedIO - first.ioCounter.WeightedIO
return count
}
type DiskIoSample struct {
partition *disk.PartitionStat
firstItem *DiskIoSampleItem
secondItem *DiskIoSampleItem
}
func (sample *DiskIoSample) GetReadCount() uint64 {
return sample.secondItem.ioCounter.ReadCount - sample.firstItem.ioCounter.ReadCount
}
func (sample *DiskIoSample) GetReadFlow() uint64 {
return getReadFlow(sample.firstItem, sample.secondItem)
}
func (sample *DiskIoSample) GetReadBytes() uint64 {
return sample.secondItem.ioCounter.ReadBytes - sample.firstItem.ioCounter.ReadBytes
}
func (sample *DiskIoSample) GetReadTotalWaitTime() time.Duration {
return time.Duration(getTotalReadWaitTime(sample.firstItem, sample.secondItem)) * time.Millisecond
}
func (sample *DiskIoSample) GetReadAvgWaitTime() time.Duration {
if sample.GetReadCount() == 0 {
return 0
}
return sample.GetReadTotalWaitTime() / time.Duration(sample.GetReadCount())
}
func (sample *DiskIoSample) GetMergedReadCount() uint64 {
return sample.secondItem.ioCounter.MergedReadCount - sample.firstItem.ioCounter.MergedReadCount
}
func (sample *DiskIoSample) GetWriteCount() uint64 {
return sample.secondItem.ioCounter.WriteCount - sample.firstItem.ioCounter.WriteCount
}
func (sample *DiskIoSample) GetWriteFlow() uint64 {
return getWriteFlow(sample.firstItem, sample.secondItem)
}
func (sample *DiskIoSample) GetWriteBytes() uint64 {
return sample.secondItem.ioCounter.WriteBytes - sample.firstItem.ioCounter.WriteBytes
}
func (sample *DiskIoSample) GetWriteTotalWaitTime() time.Duration {
return time.Duration(getTotalWriteWaitTime(sample.firstItem, sample.secondItem)) * time.Millisecond
}
func (sample *DiskIoSample) GetWriteAvgWaitTime() time.Duration {
if sample.GetWriteCount() == 0 {
return 0
}
return sample.GetWriteTotalWaitTime() / time.Duration(sample.GetWriteCount())
}
func (sample *DiskIoSample) GetMergedWriteCount() uint64 {
return sample.secondItem.ioCounter.MergedWriteCount - sample.firstItem.ioCounter.MergedWriteCount
}
func (sample *DiskIoSample) GetIoCount() uint64 {
return getIoCount(sample.firstItem, sample.secondItem)
}
func (sample *DiskIoSample) GetIoTotalWaitTime() time.Duration {
return time.Duration(getIoTotalWaitTime(sample.firstItem, sample.secondItem)) * time.Millisecond
}
func (sample *DiskIoSample) GetIoAvgWaitTime() time.Duration {
if sample.GetIoCount() == 0 {
return 0
}
return sample.GetIoTotalWaitTime() / time.Duration(sample.GetIoCount())
}
func (sample *DiskIoSample) GetWeightedTotalWaitTime() time.Duration {
return time.Duration(getIoTotalWeightedWaitTime(sample.firstItem, sample.secondItem)) * time.Millisecond
}
func (sample *DiskIoSample) GetWeightedAvgWaitTime() time.Duration {
if sample.GetIoCount() == 0 {
return 0
}
return sample.GetWeightedTotalWaitTime() / time.Duration(sample.GetIoCount())
}
func (sample *DiskIoSample) GetIopsInProgress() uint64 {
return sample.secondItem.ioCounter.IopsInProgress
}
func (sample *DiskIoSample) GetIoUtilPercent() float64 {
return float64(sample.GetIoTotalWaitTime()) / float64(sample.GetSampleDuration()) * 100
}
func (sample *DiskIoSample) GetSampleDuration() time.Duration {
return sample.secondItem.time.Sub(sample.firstItem.time)
}
func (sample *DiskIoSample) GetPartition() *disk.PartitionStat {
return sample.partition
}
func GetDiskIoSample(partition *disk.PartitionStat, duration time.Duration) (DiskIoSample, error) {
var sample DiskIoSample
first, err := getDiskIoSampleItem(partition)
if err != nil {
return sample, err
}
time.Sleep(duration)
second, err := getDiskIoSampleItem(partition)
if err != nil {
return sample, err
}
sample.partition = partition
sample.firstItem = first
sample.secondItem = second
return sample, nil
}
func GetDisksIoSample(partitions []*disk.PartitionStat, duration time.Duration) (map[string]DiskIoSample, error) {
count := len(partitions)
samples := make(map[string]DiskIoSample)
if count != 0 {
firstItems := make([]*DiskIoSampleItem, 0, count)
for i := 0; i < count; i++ {
first, err := getDiskIoSampleItem(partitions[i])
if err != nil {
return nil, err
}
firstItems = append(firstItems, first)
}
time.Sleep(duration)
for i := 0; i < count; i++ {
var sample DiskIoSample
first := firstItems[i]
second, err := getDiskIoSampleItem(partitions[i])
if err != nil {
return nil, err
}
sample.partition = partitions[i]
sample.firstItem = first
sample.secondItem = second
samples[partitions[i].Device] = sample
}
}
return samples, nil
}
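// Usage sketch (not part of the original sources): sample the partition
// backing a path for one second and report its IO utilisation percentage.
// Sampling windows should be at least one millisecond for meaningful flows.
func exampleSampleUtil(path string) (float64, error) {
partition, err := GetMatchParation(path)
if err != nil {
return 0, err
}
sample, err := GetDiskIoSample(partition, time.Second)
if err != nil {
return 0, err
}
return sample.GetIoUtilPercent(), nil
}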
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package loadutil
import "github.com/shirou/gopsutil/mem"
func GetMemUsedPercent() float64 {
memInfo, err := mem.VirtualMemory()
if err != nil || memInfo == nil {
// avoid dereferencing a nil result when the sample fails
return 0
}
return memInfo.UsedPercent
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"log"
"math"
"net/http"
"os"
"path"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
blog "github.com/cubefs/cubefs/blobstore/util/log"
syslog "log"
)
type Level uint8
const (
DebugLevel Level = 1
InfoLevel = DebugLevel<<1 + 1
WarnLevel = InfoLevel<<1 + 1
ErrorLevel = WarnLevel<<1 + 1
FatalLevel = ErrorLevel<<1 + 1
CriticalLevel = FatalLevel << +1
ReadLevel = InfoLevel
UpdateLevel = InfoLevel
)
const (
FileNameDateFormat = "20060102150405"
FileOpt = os.O_RDWR | os.O_CREATE | os.O_APPEND
WriterBufferInitSize = 4 * 1024 * 1024
WriterBufferLenLimit = 4 * 1024 * 1024
DefaultRotateInterval = 1 * time.Second
RotatedExtension = ".old"
MaxReservedDays = 7 * 24 * time.Hour
)
var levelPrefixes = []string{
"[DEBUG]",
"[INFO ]",
"[WARN ]",
"[ERROR]",
"[FATAL]",
"[READ ]",
"[WRITE]",
"[Critical]",
}
type RotatedFile []os.FileInfo
func (f RotatedFile) Less(i, j int) bool {
return f[i].ModTime().Before(f[j].ModTime())
}
func (f RotatedFile) Len() int {
return len(f)
}
func (f RotatedFile) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
func setBlobLogLevel(loglevel Level) {
blevel := blog.Lwarn
switch loglevel {
case DebugLevel:
blevel = blog.Ldebug
case InfoLevel:
blevel = blog.Linfo
case WarnLevel:
blevel = blog.Lwarn
case ErrorLevel:
blevel = blog.Lerror
default:
blevel = blog.Lwarn
}
blog.SetOutputLevel(blevel)
}
type asyncWriter struct {
file *os.File
fileName string
logSize int64
rotateSize int64
buffer *bytes.Buffer
flushTmp *bytes.Buffer
flushC chan bool
rotateDay chan struct{} // TODO rotateTime?
mu sync.Mutex
rotateMu sync.Mutex
}
func (writer *asyncWriter) flushScheduler() {
ticker := time.NewTicker(1 * time.Second)
for {
select {
case <-ticker.C:
writer.flushToFile()
case _, open := <-writer.flushC:
writer.flushToFile()
if !open {
ticker.Stop()
// TODO Unhandled errors
writer.file.Close()
return
}
}
}
}
// Write buffers the log data and triggers an asynchronous flush once the
// buffer exceeds WriterBufferLenLimit.
func (writer *asyncWriter) Write(p []byte) (n int, err error) {
writer.mu.Lock()
writer.buffer.Write(p)
bufLen := writer.buffer.Len()
writer.mu.Unlock()
n = len(p)
if bufLen > WriterBufferLenLimit {
select {
case writer.flushC <- true:
default:
}
}
return
}
// Close closes the writer.
func (writer *asyncWriter) Close() (err error) {
writer.mu.Lock()
defer writer.mu.Unlock()
close(writer.flushC)
return
}
// Flush flushes the write.
func (writer *asyncWriter) Flush() {
writer.flushToFile()
// TODO Unhandled errors
writer.file.Sync()
}
func (writer *asyncWriter) flushToFile() {
writer.mu.Lock()
writer.buffer, writer.flushTmp = writer.flushTmp, writer.buffer
writer.mu.Unlock()
isRotateDay := false
select {
case <-writer.rotateDay:
isRotateDay = true
default:
}
flushLength := writer.flushTmp.Len()
writer.rotateMu.Lock()
if (writer.logSize+int64(flushLength)) >= writer.rotateSize || isRotateDay {
oldFile := writer.fileName + "." + time.Now().Format(FileNameDateFormat) + RotatedExtension
if _, err := os.Lstat(oldFile); err != nil {
if err := writer.rename(oldFile); err == nil {
if fp, err := os.OpenFile(writer.fileName, FileOpt, 0o666); err == nil {
writer.file.Close()
writer.file = fp
writer.logSize = 0
_ = os.Chmod(writer.fileName, 0o666)
} else {
syslog.Printf("log rotate: openFile %v error: %v", writer.fileName, err)
}
} else {
syslog.Printf("log rotate: rename %v error: %v ", oldFile, err)
}
} else {
syslog.Printf("log rotate: lstat error: %v already exists", oldFile)
}
}
writer.rotateMu.Unlock()
writer.logSize += int64(flushLength)
// TODO Unhandled errors
writer.file.Write(writer.flushTmp.Bytes())
writer.flushTmp.Reset()
}
func (writer *asyncWriter) rename(newName string) error {
if err := os.Rename(writer.fileName, newName); err != nil {
return err
}
return nil
}
func newAsyncWriter(fileName string, rotateSize int64) (*asyncWriter, error) {
fp, err := os.OpenFile(fileName, FileOpt, 0o666)
if err != nil {
return nil, err
}
fInfo, err := fp.Stat()
if err != nil {
return nil, err
}
_ = os.Chmod(fileName, 0o666)
w := &asyncWriter{
file: fp,
fileName: fileName,
rotateSize: rotateSize,
logSize: fInfo.Size(),
buffer: bytes.NewBuffer(make([]byte, 0, WriterBufferInitSize)),
flushTmp: bytes.NewBuffer(make([]byte, 0, WriterBufferInitSize)),
flushC: make(chan bool, 1000),
rotateDay: make(chan struct{}, 1),
}
go w.flushScheduler()
return w, nil
}
// LogObject defines the log object.
type LogObject struct {
*log.Logger
object *asyncWriter
}
// Flush flushes the log object.
func (ob *LogObject) Flush() {
if ob.object != nil {
ob.object.Flush()
}
}
func (ob *LogObject) SetRotation() {
ob.object.rotateDay <- struct{}{}
}
func newLogObject(writer *asyncWriter, prefix string, flag int) *LogObject {
return &LogObject{
Logger: log.New(writer, prefix, flag),
object: writer,
}
}
// Log defines the log struct.
type Log struct {
dir string
errorLogger *LogObject
warnLogger *LogObject
debugLogger *LogObject
infoLogger *LogObject
readLogger *LogObject
updateLogger *LogObject
criticalLogger *LogObject
qosLogger *LogObject
level Level
rotate *LogRotate
lastRolledTime time.Time
printStderr int32
}
var (
ErrLogFileName = "_error.log"
WarnLogFileName = "_warn.log"
InfoLogFileName = "_info.log"
DebugLogFileName = "_debug.log"
ReadLogFileName = "_read.log"
UpdateLogFileName = "_write.log"
CriticalLogFileName = "_critical.log"
QoSLogFileName = "_qos.log"
)
var gLog *Log = nil
var LogDir string
func (l *Log) DisableStderrOutput() {
atomic.StoreInt32(&l.printStderr, 0)
}
func (l *Log) outputStderr(calldepth int, s string) {
if atomic.LoadInt32(&l.printStderr) != 0 {
log.Output(calldepth+1, s)
}
}
// InitLog initializes the log.
func InitLog(dir, module string, level Level, rotate *LogRotate, logLeftSpaceLimit int64) (*Log, error) {
l := new(Log)
l.printStderr = 1
dir = path.Join(dir, module)
l.dir = dir
LogDir = dir
fi, err := os.Stat(dir)
if err != nil {
os.MkdirAll(dir, 0o755)
} else {
if !fi.IsDir() {
return nil, errors.New(dir + " is not a directory")
}
}
_ = os.Chmod(dir, 0o755)
fs := syscall.Statfs_t{}
if err := syscall.Statfs(dir, &fs); err != nil {
return nil, fmt.Errorf("[InitLog] stats disk space: %s", err.Error())
}
if rotate == nil {
rotate = NewLogRotate()
}
if rotate.headRoom == 0 {
var minLogLeftSpaceLimit float64
if float64(fs.Bavail*uint64(fs.Bsize)) < float64(fs.Blocks*uint64(fs.Bsize))*DefaultHeadRatio {
minLogLeftSpaceLimit = float64(fs.Bavail*uint64(fs.Bsize)) * DefaultHeadRatio / 1024 / 1024
} else {
minLogLeftSpaceLimit = float64(fs.Blocks*uint64(fs.Bsize)) * DefaultHeadRatio / 1024 / 1024
}
minLogLeftSpaceLimit = math.Max(minLogLeftSpaceLimit, float64(logLeftSpaceLimit))
rotate.SetHeadRoomMb(int64(math.Min(minLogLeftSpaceLimit, DefaultHeadRoom)))
}
if rotate.rotateSize == 0 {
minRotateSize := int64(fs.Bavail * uint64(fs.Bsize) / uint64(len(levelPrefixes)))
if minRotateSize < DefaultMinRotateSize {
minRotateSize = DefaultMinRotateSize
}
rotate.SetRotateSizeMb(int64(math.Min(float64(minRotateSize), float64(DefaultRotateSize))))
}
l.rotate = rotate
err = l.initLog(dir, module, level)
if err != nil {
return nil, err
}
l.lastRolledTime = time.Now()
go l.checkLogRotation(dir, module)
gLog = l
setBlobLogLevel(level)
return l, nil
}
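// Usage sketch (not part of the original sources): initialise the module
// logger with default rotation; the directory and module name are examples.
func exampleInitLog() (*Log, error) {
return InitLog("/var/logs/cubefs", "master", InfoLevel, nil, 0)
}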
func TruncMsg(msg string) string {
return TruncMsgWith(msg, 100)
}
func TruncMsgWith(msg string, size int) string {
if len(msg) < size {
return msg
}
return msg[0:size]
}
func OutputPid(logDir, role string) error {
pidFile := path.Join(logDir, fmt.Sprintf("%s.pid", role))
file, err := os.Create(pidFile)
if err != nil {
return fmt.Errorf("open pid file %s error %s", pidFile, err.Error())
}
pid := os.Getpid()
_, err = file.Write([]byte(fmt.Sprintf("%d", pid)))
if err != nil {
return fmt.Errorf("write pid failed, pid %d, file %s, err %s", pid, pidFile, err.Error())
}
file.Close()
return nil
}
func (l *Log) initLog(logDir, module string, level Level) error {
logOpt := log.LstdFlags | log.Lmicroseconds
newLog := func(logFileName string) (newLogger *LogObject, err error) {
logName := path.Join(logDir, module+logFileName)
w, err := newAsyncWriter(logName, l.rotate.rotateSize)
if err != nil {
return
}
newLogger = newLogObject(w, "", logOpt)
return
}
var err error
logHandles := [...]**LogObject{&l.debugLogger, &l.infoLogger, &l.warnLogger, &l.errorLogger, &l.readLogger, &l.updateLogger, &l.criticalLogger, &l.qosLogger}
logNames := [...]string{DebugLogFileName, InfoLogFileName, WarnLogFileName, ErrLogFileName, ReadLogFileName, UpdateLogFileName, CriticalLogFileName, QoSLogFileName}
for i := range logHandles {
if *logHandles[i], err = newLog(logNames[i]); err != nil {
return err
}
}
l.level = level
return nil
}
// SetPrefix sets the log prefix.
func (l *Log) SetPrefix(s, level string) string {
_, file, line, ok := runtime.Caller(2)
if !ok {
line = 0
}
short := file
for i := len(file) - 1; i > 0; i-- {
if file[i] == '/' {
short = file[i+1:]
break
}
}
file = short
return level + " " + file + ":" + strconv.Itoa(line) + ": " + s
}
// Flush flushes the log.
func (l *Log) Flush() {
loggers := []*LogObject{
l.debugLogger,
l.infoLogger,
l.warnLogger,
l.errorLogger,
l.readLogger,
l.updateLogger,
l.criticalLogger,
}
for _, logger := range loggers {
if logger != nil {
logger.Flush()
}
}
}
const (
SetLogLevelPath = "/loglevel/set"
)
func SetLogLevel(w http.ResponseWriter, r *http.Request) {
var err error
if err = r.ParseForm(); err != nil {
buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
levelStr := r.FormValue("level")
var level Level
switch strings.ToLower(levelStr) {
case "debug":
level = DebugLevel
case "info", "read", "write":
level = InfoLevel
case "warn":
level = WarnLevel
case "error":
level = ErrorLevel
case "critical":
level = CriticalLevel
case "fatal":
level = FatalLevel
default:
err = fmt.Errorf("level only can be set :debug,info,warn,error,critical,read,write,fatal")
buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
gLog.level = Level(level)
setBlobLogLevel(level)
buildSuccessResp(w, "set log level success")
}
func buildSuccessResp(w http.ResponseWriter, data interface{}) {
buildJSONResp(w, http.StatusOK, data, "")
}
func buildFailureResp(w http.ResponseWriter, code int, msg string) {
buildJSONResp(w, code, nil, msg)
}
// buildJSONResp creates the JSON response for the API request.
func buildJSONResp(w http.ResponseWriter, code int, data interface{}, msg string) {
var (
jsonBody []byte
err error
)
// headers must be set before WriteHeader, otherwise they are silently ignored
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
body := struct {
Code int `json:"code"`
Data interface{} `json:"data"`
Msg string `json:"msg"`
}{
Code: code,
Data: data,
Msg: msg,
}
if jsonBody, err = json.Marshal(body); err != nil {
return
}
w.Write(jsonBody)
}
// LogWarn indicates the warnings.
func LogWarn(v ...interface{}) {
if gLog == nil {
return
}
if WarnLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[2])
gLog.warnLogger.Output(2, s)
}
// LogWarnf indicates the warnings with specific format.
func LogWarnf(format string, v ...interface{}) {
if gLog == nil {
return
}
if WarnLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[2])
gLog.warnLogger.Output(2, s)
}
// LogInfo logs at the info level.
func LogInfo(v ...interface{}) {
if gLog == nil {
return
}
if InfoLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[1])
gLog.infoLogger.Output(2, s)
}
// LogInfof logs at the info level with the specified format.
func LogInfof(format string, v ...interface{}) {
if gLog == nil {
return
}
if InfoLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[1])
gLog.infoLogger.Output(2, s)
}
func EnableInfo() bool {
if gLog == nil {
return false
}
return InfoLevel&gLog.level == gLog.level
}
// LogError logs the errors.
func LogError(v ...interface{}) {
if gLog == nil {
return
}
if ErrorLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[3])
gLog.errorLogger.Output(2, s)
}
// LogErrorf logs the errors with the specified format.
func LogErrorf(format string, v ...interface{}) {
if gLog == nil {
return
}
if ErrorLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[3])
gLog.errorLogger.Print(s)
}
// LogDebug logs the debug information.
func LogDebug(v ...interface{}) {
if gLog == nil {
return
}
if DebugLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[0])
gLog.debugLogger.Print(s)
}
// LogDebugf logs the debug information with specified format.
func LogDebugf(format string, v ...interface{}) {
if gLog == nil {
return
}
if DebugLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[0])
gLog.debugLogger.Output(2, s)
}
func EnableDebug() bool {
if gLog == nil {
return false
}
return DebugLevel&gLog.level == gLog.level
}
// LogFatal logs the fatal errors.
func LogFatal(v ...interface{}) {
if gLog == nil {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[4])
gLog.errorLogger.Output(2, s)
gLog.Flush()
os.Exit(1)
}
// LogFatalf logs the fatal errors with specified format.
func LogFatalf(format string, v ...interface{}) {
if gLog == nil {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[4])
gLog.errorLogger.Output(2, s)
gLog.Flush()
os.Exit(1)
}
// LogCritical logs the critical errors.
func LogCritical(v ...interface{}) {
if gLog == nil {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[4])
gLog.criticalLogger.Output(2, s)
gLog.outputStderr(2, s)
}
// LogCriticalf logs the critical errors with the specified format.
func LogCriticalf(format string, v ...interface{}) {
if gLog == nil {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[4])
gLog.criticalLogger.Output(2, s)
gLog.outputStderr(2, s)
}
// LogRead logs the read operations.
func LogRead(v ...interface{}) {
if gLog == nil {
return
}
if ReadLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[5])
gLog.readLogger.Output(2, s)
}
// LogReadf logs the read operations with the specified format. TODO: not used?
func LogReadf(format string, v ...interface{}) {
if gLog == nil {
return
}
if ReadLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[5])
gLog.readLogger.Output(2, s)
}
// QosWrite logs the QoS write information.
func QosWrite(v ...interface{}) {
if gLog == nil {
return
}
if UpdateLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[0])
gLog.qosLogger.Output(2, s)
}
// QosWriteDebugf logs the QoS debug information with the specified format. TODO: not used?
func QosWriteDebugf(format string, v ...interface{}) {
if gLog == nil {
return
}
if DebugLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[0])
gLog.qosLogger.Output(2, s)
}
// LogWrite logs the write (update) operations.
func LogWrite(v ...interface{}) {
if gLog == nil {
return
}
if UpdateLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[6])
gLog.updateLogger.Output(2, s)
}
// LogWritef logs the write (update) operations with the specified format. TODO: not used?
func LogWritef(format string, v ...interface{}) {
if gLog == nil {
return
}
if UpdateLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[6])
gLog.updateLogger.Output(2, s)
}
// LogFlush flushes the log.
func LogFlush() {
if gLog != nil {
gLog.Flush()
}
}
func LogDisableStderrOutput() {
if gLog != nil {
gLog.DisableStderrOutput()
}
}
func (l *Log) checkLogRotation(logDir, module string) {
var needDelFiles RotatedFile
for {
needDelFiles = needDelFiles[:0]
// check disk space
fs := syscall.Statfs_t{}
if err := syscall.Statfs(logDir, &fs); err != nil {
LogErrorf("check disk space: %s", err.Error())
time.Sleep(DefaultRotateInterval)
continue
}
diskSpaceLeft := int64(fs.Bavail * uint64(fs.Bsize))
diskSpaceLeft -= l.rotate.headRoom * 1024 * 1024
if diskSpaceLeft <= 0 {
LogDebugf("logLeftSpaceLimit has been reached, need to clear %v Mb of Space", (-diskSpaceLeft)/1024/1024)
}
err := l.removeLogFile(logDir, diskSpaceLeft, module)
if err != nil {
time.Sleep(DefaultRotateInterval)
continue
}
// check if it is time to rotate
now := time.Now()
if now.Day() == l.lastRolledTime.Day() {
time.Sleep(DefaultRotateInterval)
continue
}
// rotate log files
l.debugLogger.SetRotation()
l.infoLogger.SetRotation()
l.warnLogger.SetRotation()
l.errorLogger.SetRotation()
l.readLogger.SetRotation()
l.updateLogger.SetRotation()
l.criticalLogger.SetRotation()
l.lastRolledTime = now
}
}
func DeleteFileFilter(info os.FileInfo, diskSpaceLeft int64, module string) bool {
if diskSpaceLeft <= 0 {
return info.Mode().IsRegular() && strings.HasSuffix(info.Name(), RotatedExtension) && strings.HasPrefix(info.Name(), module)
}
return time.Since(info.ModTime()) > MaxReservedDays && strings.HasSuffix(info.Name(), RotatedExtension) && strings.HasPrefix(info.Name(), module)
}
func (l *Log) removeLogFile(logDir string, diskSpaceLeft int64, module string) (err error) {
// collect the log files to be deleted
fInfos, err := ioutil.ReadDir(logDir)
if err != nil {
LogErrorf("error read log directory files: %s", err.Error())
return
}
var needDelFiles RotatedFile
for _, info := range fInfos {
if DeleteFileFilter(info, diskSpaceLeft, module) {
LogDebugf("%v will be put into needDelFiles", info.Name())
needDelFiles = append(needDelFiles, info)
}
}
sort.Sort(needDelFiles)
// delete old file
for _, info := range needDelFiles {
if err = os.Remove(path.Join(logDir, info.Name())); err != nil {
LogErrorf("failed delete log file %s", info.Name())
continue
}
diskSpaceLeft += info.Size()
if diskSpaceLeft > 0 && time.Since(info.ModTime()) < MaxReservedDays {
break
}
}
err = nil
return
}
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
)
const (
InvalidLogLevel = "Invalid log level, only supports [error, warn, debug, info, read, update, critical]"
OpenLogFileFailed = "Failed to open log file"
GetLogNumFailed = "Failed to get param num"
TailLogFileFailed = "Failed to tail log file"
InvaildLogNum = ", invalid num param, use default num"
TooBigNum = ", param num is too big, use default max num"
LossNum = ", can't find num param, use default num"
GetLogPath = "/log/get"
buffSize = int64(4096)
maxLogLine = 10000
defaultLogLine = 100
)
// HTTPReply uniform response structure
type HTTPReply struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data interface{} `json:"data"`
}
func GetLog(w http.ResponseWriter, r *http.Request) {
query := r.URL.Query()
levelStr := query.Get("level")
var fileName string
switch strings.ToLower(levelStr) {
case "error":
fileName = gLog.errorLogger.object.fileName
case "warn":
fileName = gLog.warnLogger.object.fileName
case "debug":
fileName = gLog.debugLogger.object.fileName
case "info":
fileName = gLog.infoLogger.object.fileName
case "read":
fileName = gLog.readLogger.object.fileName
case "update":
fileName = gLog.updateLogger.object.fileName
case "critical":
fileName = gLog.criticalLogger.object.fileName
default:
buildFailureResp(w, http.StatusBadRequest, InvalidLogLevel)
return
}
file, err := os.Open(fileName)
if err != nil {
buildFailureResp(w, http.StatusBadRequest, fmt.Sprintf("%s, err is [%v]", OpenLogFileFailed, err))
return
}
defer file.Close()
var msg string
var num int
numStr := query.Get("num")
if numStr == "" {
num = defaultLogLine
msg = fmt.Sprintf("%s(%d)", LossNum, defaultLogLine)
} else {
num, err = strconv.Atoi(numStr)
if err != nil {
buildFailureResp(w, http.StatusBadRequest, fmt.Sprintf("%s, err is [%v]", GetLogNumFailed, err))
return
}
}
if num <= 0 {
num = defaultLogLine
msg = fmt.Sprintf("%s(%d)", InvaildLogNum, defaultLogLine)
} else if num > maxLogLine {
num = maxLogLine
msg = fmt.Sprintf("%s(%d)", TooBigNum, maxLogLine)
}
data, err := tailn(num, file)
if err != nil {
buildFailureResp(w, http.StatusBadRequest, fmt.Sprintf("%s, err is [%v]", TailLogFileFailed, err))
return
}
sendOKReply(w, r, msg, data)
}
func tailn(line int, file *os.File) (data []string, err error) {
fileLen, err := file.Seek(0, io.SeekEnd)
if err != nil {
return
}
var dataLen int
var currNum int
var lastStr string
data = make([]string, line)
for {
currSize := buffSize
if currSize > fileLen {
currSize = fileLen
}
_, err = file.Seek(-currSize, io.SeekCurrent)
if err != nil {
return
}
buff := make([]byte, currSize)
dataLen, err = file.Read(buff)
if err != nil {
return
}
last := dataLen
for i := dataLen - 1; i >= 0; i-- {
if buff[i] == '\n' {
if i == dataLen-1 {
if lastStr != "" {
data[line-currNum] = lastStr
lastStr = ""
currNum++
if currNum >= line {
return
}
}
last = i
continue
}
currNum++
data[line-currNum] = string(buff[i+1:last]) + lastStr
lastStr = ""
if currNum >= line {
return
}
last = i
}
}
lastStr = string(buff[:last])
fileLen, err = file.Seek(-currSize, io.SeekCurrent)
if err != nil {
return
}
if fileLen <= 0 {
break
}
}
// the first line of the file has no preceding '\n'; keep it if more lines are still wanted
if lastStr != "" && currNum < line {
currNum++
data[line-currNum] = lastStr
}
if currNum < line {
data = data[line-currNum:]
}
return
}
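// exampleTailn is an illustrative sketch, not part of the original source: it
// shows how tailn returns up to the last n lines of an open log file. The
// file path below is an assumption for demonstration only.
func exampleTailn() {
file, err := os.Open("/tmp/cubefs/log/example/error.log")
if err != nil {
return
}
defer file.Close()
if lines, err := tailn(20, file); err == nil {
for _, line := range lines {
fmt.Println(line)
}
}
}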
func sendOKReply(w http.ResponseWriter, r *http.Request, msg string, data interface{}) {
reply := &HTTPReply{
Code: http.StatusOK,
Msg: "Success" + msg,
Data: data,
}
httpReply, err := json.Marshal(reply)
if err != nil {
buildFailureResp(w, http.StatusBadRequest, fmt.Sprintf("%s, err is [%v]", "", err))
return
}
send(w, r, httpReply)
}
func send(w http.ResponseWriter, r *http.Request, reply []byte) {
w.Header().Set("content-type", "application/json")
w.Header().Set("Content-Length", strconv.Itoa(len(reply)))
w.Write(reply)
}
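// exampleServeGetLog is an illustrative sketch, not part of the original
// source: it shows how the GetLog handler could be registered under
// GetLogPath. The listen address and the curl query are assumptions for
// demonstration only.
func exampleServeGetLog() {
http.HandleFunc(GetLogPath, GetLog)
// e.g. curl "http://127.0.0.1:17010/log/get?level=error&num=200"
_ = http.ListenAndServe("127.0.0.1:17010", nil)
}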
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
const (
// DefaultRotateSize specifies the size at which the output log is rotated.
// Units: bytes
DefaultRotateSize = 1 * 1024 * 1024 * 1024
DefaultMinRotateSize = 200 * 1024 * 1024
// DefaultHeadRoom The tolerance for the log space limit (in megabytes)
DefaultHeadRoom = 50 * 1024
// DefaultHeadRatio The disk reserve space ratio
DefaultHeadRatio = 0.2
DefaultLogLeftSpaceLimit = 5 * 1024
)
// LogRotate rotates a log by size or time.
type LogRotate struct {
rotateSize int64 // the size of the rotated log
headRoom int64 // capacity reserved for writing the next log on the disk
}
// NewLogRotate returns a new LogRotate instance.
func NewLogRotate() *LogRotate {
return &LogRotate{}
}
// SetRotateSizeMb sets the rotate size in terms of MB.
func (r *LogRotate) SetRotateSizeMb(size int64) {
r.rotateSize = size
}
// SetHeadRoomMb sets the headroom in terms of MB.
func (r *LogRotate) SetHeadRoomMb(size int64) {
r.headRoom = size
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
requestTimeout = 30 * time.Second
)
var ErrNoValidMaster = errors.New("no valid master")
// MasterHelper defines the helper struct to manage the master.
type MasterHelper interface {
AddNode(address string)
Nodes() []string
Leader() string
Request(method, path string, param, header map[string]string, body []byte) (data []byte, err error)
}
type masterHelper struct {
sync.RWMutex
masters []string
leaderAddr string
}
// AddNode adds the given address to the list of master addresses.
func (helper *masterHelper) AddNode(address string) {
helper.Lock()
helper.updateMaster(address)
helper.Unlock()
}
// Leader returns the current leader address.
func (helper *masterHelper) Leader() (addr string) {
helper.RLock()
addr = helper.leaderAddr
helper.RUnlock()
return
}
// Change the leader address.
func (helper *masterHelper) setLeader(addr string) {
helper.Lock()
helper.leaderAddr = addr
helper.Unlock()
}
// Request sends out the request through the helper.
func (helper *masterHelper) Request(method, path string, param, header map[string]string, reqData []byte) (respData []byte, err error) {
respData, err = helper.request(method, path, param, header, reqData)
return
}
func (helper *masterHelper) request(method, path string, param, header map[string]string, reqData []byte) (repsData []byte, err error) {
leaderAddr, nodes := helper.prepareRequest()
host := leaderAddr
for i := -1; i < len(nodes); i++ {
if i == -1 {
if host == "" {
continue
}
} else {
host = nodes[i]
}
var resp *http.Response
resp, err = helper.httpRequest(method, fmt.Sprintf("http://%s%s", host,
path), param, header, reqData)
if err != nil {
log.LogErrorf("[masterHelper] %s", err)
continue
}
stateCode := resp.StatusCode
repsData, err = io.ReadAll(resp.Body)
resp.Body.Close()
if err != nil {
log.LogErrorf("[masterHelper] %s", err)
continue
}
switch stateCode {
case http.StatusForbidden:
curMasterAddr := strings.TrimSpace(string(repsData))
curMasterAddr = strings.Replace(curMasterAddr, "\n", "", -1)
if len(curMasterAddr) == 0 {
log.LogErrorf("[masterHelper] request[%s] response statudCode"+
"[403], respBody is empty", host)
err = ErrNoValidMaster
return
}
repsData, err = helper.request(method, path, param, header, reqData)
return
case http.StatusOK:
if leaderAddr != host {
helper.setLeader(host)
}
body := &struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data json.RawMessage `json:"data"`
}{}
if err := json.Unmarshal(repsData, body); err != nil {
return nil, fmt.Errorf("unmarshal response body err:%v", err)
}
// 0 represents proto.ErrCodeSuccess
if body.Code != 0 {
return nil, fmt.Errorf("request error, code[%d], msg[%s]", body.Code, body.Msg)
}
return []byte(body.Data), nil
default:
log.LogErrorf("[masterHelper] master[%v] uri[%v] statusCode[%v] respBody[%v].",
resp.Request.URL.String(), host, stateCode, string(repsData))
continue
}
}
err = ErrNoValidMaster
return
}
// Nodes returns all master addresses.
func (helper *masterHelper) Nodes() (nodes []string) {
helper.RLock()
nodes = helper.masters
helper.RUnlock()
return
}
// prepareRequest returns the leader address and all master addresses.
func (helper *masterHelper) prepareRequest() (addr string, nodes []string) {
helper.RLock()
addr = helper.leaderAddr
nodes = helper.masters
helper.RUnlock()
return
}
func (helper *masterHelper) httpRequest(method, url string, param, header map[string]string, reqData []byte) (resp *http.Response, err error) {
client := &http.Client{}
reader := bytes.NewReader(reqData)
client.Timeout = requestTimeout
var req *http.Request
fullUrl := helper.mergeRequestUrl(url, param)
log.LogDebugf("action[httpRequest] method[%v] url[%v] reqBodyLen[%v].", method, fullUrl, len(reqData))
if req, err = http.NewRequest(method, fullUrl, reader); err != nil {
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Connection", "close")
for k, v := range header {
req.Header.Set(k, v)
}
resp, err = client.Do(req)
return
}
func (helper *masterHelper) updateMaster(address string) {
contains := false
for _, master := range helper.masters {
if master == address {
contains = true
break
}
}
if !contains {
helper.masters = append(helper.masters, address)
}
helper.leaderAddr = address
}
func (helper *masterHelper) mergeRequestUrl(url string, params map[string]string) string {
if len(params) > 0 {
buff := bytes.NewBuffer([]byte(url))
isFirstParam := true
for k, v := range params {
if isFirstParam {
buff.WriteString("?")
isFirstParam = false
} else {
buff.WriteString("&")
}
buff.WriteString(k)
buff.WriteString("=")
buff.WriteString(v)
}
return buff.String()
}
return url
}
// NewMasterHelper returns a new MasterHelper instance.
func NewMasterHelper() MasterHelper {
return &masterHelper{}
}
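// exampleMasterRequest is an illustrative sketch, not part of the original
// source: it shows the intended MasterHelper flow of registering master
// addresses and issuing a request that follows leader redirection. The
// addresses and URI below are assumptions for demonstration only.
func exampleMasterRequest() {
helper := NewMasterHelper()
helper.AddNode("192.168.0.11:17010")
helper.AddNode("192.168.0.12:17010")
data, err := helper.Request(http.MethodGet, "/admin/getCluster", nil, nil, nil)
if err != nil {
log.LogErrorf("master request failed: %v", err)
return
}
log.LogInfof("master response: %s", string(data))
}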
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
)
const (
MEMINFO = "/proc/meminfo"
PRO_MEM = "/proc/%d/status"
)
// GetMemInfo returns the memory information.
func GetMemInfo() (total, used uint64, err error) {
fp, err := os.Open(MEMINFO)
if err != nil {
return
}
// TODO Unhandled errors
defer fp.Close()
var (
val uint64
free uint64
buffer uint64
cached uint64
)
scan := bufio.NewScanner(fp)
for scan.Scan() {
line := scan.Text()
fields := strings.Split(line, ":")
if len(fields) != 2 {
continue
}
key := fields[0]
value := strings.TrimSpace(fields[1])
value = strings.Replace(value, " kB", "", -1)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
return
}
switch key {
case "MemTotal":
total = val * KB
case "MemFree":
free = val * KB
case "Buffers":
buffer = val * KB
case "Cached":
cached = val * KB
default:
// do nothing
}
}
used = total - free - buffer - cached
return
}
func GetProcessMemory(pid int) (used uint64, err error) {
proFileName := fmt.Sprintf(PRO_MEM, pid)
fp, err := os.Open(proFileName)
if err != nil {
return
}
defer fp.Close()
scan := bufio.NewScanner(fp)
for scan.Scan() {
line := scan.Text()
fields := strings.Split(line, ":")
key := fields[0]
if key != "VmRSS" {
continue
}
value := strings.TrimSpace(fields[1])
value = strings.Replace(value, " kB", "", -1)
used, err = strconv.ParseUint(value, 10, 64)
if err != nil {
return
}
used = used * KB
break
}
return
}
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"fmt"
"strconv"
"strings"
"github.com/google/uuid"
)
const (
multipartIDMetaLength = 25
multipartIDFlagLength = 2
multipartIDDelimiter = "x"
)
type MultipartID string
func (id MultipartID) String() string {
return string(id)
}
func (id MultipartID) PartitionID() (pID uint64, found bool) {
if len(id) < multipartIDMetaLength {
return
}
var (
mpStart int
mpEnd int
flag string
length int64
appendInfo []rune
mpIdString string
delimiterIndex int
err error
)
delimiterIndex = len(id) - multipartIDMetaLength
appendInfo = []rune(id)[delimiterIndex:]
if string(appendInfo[0]) != multipartIDDelimiter {
return
}
flag = string(appendInfo[1 : multipartIDFlagLength+1])
length, err = strconv.ParseInt(flag, 10, 32)
if err != nil {
return 0, false
}
mpStart = 1 + multipartIDFlagLength
mpEnd = mpStart + int(length)
mpIdString = string(appendInfo[mpStart:mpEnd])
pID, err = strconv.ParseUint(mpIdString, 10, 64)
found = err == nil
return
}
func MultipartIDFromString(src string) MultipartID {
return MultipartID(src)
}
func CreateMultipartID(mpId uint64) MultipartID {
var (
mpIdLength string
multipartId string
)
// Append the special char 'x' and the meta partition id after the generated multipart id.
// If the appended string is shorter than 25 characters, pad it with a random string.
tempLength := len(strconv.FormatUint(mpId, 10))
// The length flag is fixed at two digits; if the digit count of the meta
// partition id needs fewer characters, pad the flag with leading '0'.
if len(strconv.Itoa(tempLength)) < multipartIDFlagLength {
for i := 0; i < multipartIDFlagLength-len(strconv.Itoa(tempLength)); i++ {
mpIdLength += "0"
}
mpIdLength += strconv.Itoa(tempLength)
}
appendMultipart := fmt.Sprintf("%s%d", mpIdLength, mpId)
nextId := strings.ReplaceAll(uuid.New().String(), "-", "")
if len(appendMultipart) < multipartIDMetaLength-1 {
l := multipartIDMetaLength - 1 - len(appendMultipart)
t := strings.ReplaceAll(uuid.New().String(), "-", "")
r := string([]rune(t)[:l])
multipartId = fmt.Sprintf("%s%s%s%s", nextId, multipartIDDelimiter, appendMultipart, r)
} else {
multipartId = fmt.Sprintf("%s%s%s", nextId, multipartIDDelimiter, appendMultipart)
}
return MultipartID(multipartId)
}
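// exampleMultipartID is an illustrative sketch, not part of the original
// source: CreateMultipartID embeds the meta partition id inside the
// 25-character suffix, and PartitionID recovers it from the string form.
func exampleMultipartID() {
id := CreateMultipartID(42)
if pid, found := id.PartitionID(); found {
fmt.Printf("multipart id %s belongs to meta partition %d\n", id, pid) // pid == 42
}
}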
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import "sync"
type (
Null struct{}
Set struct {
sync.RWMutex
m map[string]Null
}
)
func NewSet() *Set {
return &Set{
m: map[string]Null{},
}
}
func (s *Set) Add(val string) {
s.Lock()
defer s.Unlock()
s.m[val] = Null{}
}
func (s *Set) Remove(val string) {
s.Lock()
defer s.Unlock()
delete(s.m, val)
}
func (s *Set) Has(key string) bool {
s.RLock()
defer s.RUnlock()
_, ok := s.m[key]
return ok
}
func (s *Set) Len() int {
s.RLock()
defer s.RUnlock()
return len(s.m)
}
func (s *Set) Clear() {
s.Lock()
defer s.Unlock()
s.m = make(map[string]Null)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"fmt"
"io"
"net"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"unsafe"
"github.com/cubefs/cubefs/util/errors"
"github.com/xtaci/smux"
)
const (
DefaultSmuxPortShift = 500
)
const (
defaultCreateInterval = int64(time.Microsecond * 200)
)
var ErrTooMuchSmuxStreams = errors.New("too many smux streams")
// ShiftAddrPort changes the addr(ip:port) to afterShift(ip:(port+shift)).
func ShiftAddrPort(addr string, shift int) (afterShift string) {
pars := strings.Split(addr, ":")
if len(pars) != 2 {
return
}
ip, port := pars[0], pars[1]
portNum, err := strconv.Atoi(port)
if err != nil {
return
}
afterShift = fmt.Sprintf("%s:%d", ip, portNum+shift)
return
}
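// exampleShiftAddrPort is an illustrative sketch, not part of the original
// source: the smux address is derived from the TCP address by shifting the
// port by DefaultSmuxPortShift. The address below is an assumption for
// demonstration only.
func exampleShiftAddrPort() {
smuxAddr := ShiftAddrPort("192.168.0.21:17030", DefaultSmuxPortShift)
fmt.Println(smuxAddr) // "192.168.0.21:17530"
}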
// FilterSmuxAcceptError filters out the smux accept errors that can be safely ignored.
func FilterSmuxAcceptError(err error) error {
if err == nil {
return nil
}
if err.Error() == io.EOF.Error() {
return nil
}
if operr, ok := err.(*net.OpError); ok {
if operr.Err == syscall.ECONNRESET {
return nil
}
}
return err
}
const (
streamPreAlloc = 1
connPreAlloc = 1
)
type SmuxConnPoolConfig struct {
*smux.Config
TotalStreams int
StreamsPerConn int
ConnsPerAddr int
PoolCapacity int
DialTimeout time.Duration
StreamIdleTimeout int64
}
func DefaultSmuxConnPoolConfig() *SmuxConnPoolConfig {
return &SmuxConnPoolConfig{
Config: DefaultSmuxConfig(),
TotalStreams: 1000000,
StreamsPerConn: 1,
ConnsPerAddr: 16,
PoolCapacity: 64,
DialTimeout: time.Second * 10,
StreamIdleTimeout: int64(time.Second * 60),
}
}
func VerifySmuxPoolConfig(cfg *SmuxConnPoolConfig) error {
if err := smux.VerifyConfig(cfg.Config); err != nil {
return err
}
if cfg.ConnsPerAddr <= 0 {
return errors.New("cfg.ConnsPerAddr must be larger than 0")
}
if cfg.PoolCapacity <= 0 {
return errors.New("cfg.PoolCapacity must be larger than 0")
}
if cfg.StreamsPerConn <= 0 {
return errors.New("cfg.StreamsPerConn must be larger than 0")
}
if cfg.StreamIdleTimeout < int64(10*time.Millisecond) {
return errors.New("cfg.StreamIdleTimeout too small, must be larger than 10ms")
}
if cfg.TotalStreams <= 0 {
return errors.New("cfg.TotalStreams must be larger than 0")
}
return nil
}
func DefaultSmuxConfig() *smux.Config {
return smux.DefaultConfig()
}
var gConfig = DefaultSmuxConnPoolConfig()
type SmuxConnPoolStat struct {
TotalStreams int `json:"totalStreams"`
TotalStreamsReported int `json:"totalStreamsInflight"`
Pools map[string]*SmuxPoolStat `json:"pools"`
TotalSessions int `json:"totalSessions"`
Bucket int `json:"bucket"`
}
// simpleTokenBucket is a simple token bucket that limits the total number of streams.
type simpleTokenBucket struct {
bucket int64
notify chan struct{}
blocked bool
}
func newSimpleTokenBucket(n int64, blocked bool) *simpleTokenBucket {
return &simpleTokenBucket{
bucket: n,
notify: make(chan struct{}, 1),
blocked: blocked,
}
}
func (b *simpleTokenBucket) consumeTokens(n int) bool {
if atomic.AddInt64(&b.bucket, int64(-n)) < 0 {
if b.blocked {
<-b.notify
} else {
atomic.AddInt64(&b.bucket, int64(n))
return false
}
}
return true
}
func (b *simpleTokenBucket) returnTokens(n int) {
if atomic.AddInt64(&b.bucket, int64(n)) > 0 {
if b.blocked {
select {
case b.notify <- struct{}{}:
default:
}
}
}
}
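// exampleTokenBucket is an illustrative sketch, not part of the original
// source, of the non-blocking bucket used for stream accounting: consumeTokens
// fails fast once the bucket is exhausted, and returnTokens gives the capacity
// back to later callers.
func exampleTokenBucket() {
bucket := newSimpleTokenBucket(2, false)
_ = bucket.consumeTokens(1) // true: one token remains
_ = bucket.consumeTokens(2) // false: a non-blocking bucket refuses to overdraw
bucket.returnTokens(1)      // the consumed token is handed back
}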
type SmuxConnectPool struct {
sync.RWMutex
streamBucket *simpleTokenBucket
cfg *SmuxConnPoolConfig
pools map[string]*SmuxPool
closeCh chan struct{}
closeOnce sync.Once
}
func NewSmuxConnectPool(cfg *SmuxConnPoolConfig) (cp *SmuxConnectPool) {
if cfg == nil {
cfg = gConfig
}
cp = &SmuxConnectPool{
streamBucket: newSimpleTokenBucket(int64(cfg.TotalStreams), false),
cfg: cfg,
pools: make(map[string]*SmuxPool),
closeCh: make(chan struct{}),
closeOnce: sync.Once{},
}
go cp.autoRelease()
return cp
}
func (cp *SmuxConnectPool) GetConnect(targetAddr string) (c *smux.Stream, err error) {
cp.RLock()
pool, ok := cp.pools[targetAddr]
cp.RUnlock()
if !ok {
cp.Lock()
pool, ok = cp.pools[targetAddr]
if !ok {
pool = NewSmuxPool(cp.cfg, targetAddr, cp.streamBucket)
cp.pools[targetAddr] = pool
}
cp.Unlock()
}
return pool.GetConnect()
}
func (cp *SmuxConnectPool) PutConnect(stream *smux.Stream, forceClose bool) {
if stream == nil {
return
}
select {
case <-cp.closeCh:
return
default:
}
addr := stream.RemoteAddr().String()
cp.RLock()
pool, ok := cp.pools[addr]
cp.RUnlock()
if !ok {
return
}
if forceClose {
pool.MarkClosed(stream)
return
}
pool.PutStreamObjectToPool(&streamObject{stream: stream, idle: time.Now().UnixNano()})
}
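// exampleSmuxPoolUsage is an illustrative sketch, not part of the original
// source, of the borrow/return cycle around GetConnect and PutConnect. The
// target address is an assumption for demonstration only.
func exampleSmuxPoolUsage() {
pool := NewSmuxConnectPool(DefaultSmuxConnPoolConfig())
defer pool.Close()
stream, err := pool.GetConnect("192.168.0.21:17530")
if err != nil {
fmt.Println("get smux stream failed:", err)
return
}
_, err = stream.Write([]byte("ping"))
// return the stream to the pool; force-close it when the request failed
pool.PutConnect(stream, err != nil)
}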
func (cp *SmuxConnectPool) autoRelease() {
timer := time.NewTimer(time.Duration(cp.cfg.StreamIdleTimeout))
for {
select {
case <-cp.closeCh:
timer.Stop()
return
case <-timer.C:
}
pools := make([]*SmuxPool, 0)
cp.RLock()
for _, pool := range cp.pools {
pools = append(pools, pool)
}
cp.RUnlock()
for _, pool := range pools {
pool.autoRelease()
}
timer.Reset(time.Duration(cp.cfg.StreamIdleTimeout))
}
}
func (cp *SmuxConnectPool) releaseAll() {
pools := make([]*SmuxPool, 0)
cp.RLock()
for _, pool := range cp.pools {
pools = append(pools, pool)
}
cp.RUnlock()
for _, pool := range pools {
pool.ReleaseAll()
}
}
func (cp *SmuxConnectPool) Close() {
cp.closeOnce.Do(func() {
close(cp.closeCh)
cp.releaseAll()
})
}
func (cp *SmuxConnectPool) GetStat() *SmuxConnPoolStat {
stat := &SmuxConnPoolStat{
TotalStreams: 0,
TotalStreamsReported: 0,
Pools: make(map[string]*SmuxPoolStat),
TotalSessions: 0,
}
cp.RLock()
for remote, pool := range cp.pools {
stat.Pools[remote] = pool.GetStat()
}
cp.RUnlock()
for _, poolStat := range stat.Pools {
stat.TotalSessions += poolStat.TotalSessions
stat.TotalStreams += poolStat.InflightStreams
stat.TotalStreamsReported += poolStat.InflightStreamsReported
}
stat.Bucket = int(atomic.LoadInt64(&cp.streamBucket.bucket))
return stat
}
type createSessCall struct {
idle int64
notify chan struct{}
sess *smux.Session
err error
}
type streamObject struct {
stream *smux.Stream
idle int64
}
type SmuxPool struct {
target string
sessionsLock sync.RWMutex
sessionsIter int64
sessions []*smux.Session
cfg *SmuxConnPoolConfig
objects chan *streamObject
inflightStreams int64
createSessCall *createSessCall
streamBucket *simpleTokenBucket
}
type SmuxPoolStat struct {
Addr string `json:"addr"`
InflightStreams int `json:"inflightStreams"`
InflightStreamsReported int `json:"inflightStreamReported"`
TotalSessions int `json:"totalSessions"`
StreamsPerSession map[string]int `json:"streamsPerSession"`
}
func NewSmuxPool(cfg *SmuxConnPoolConfig, target string, streamBucket *simpleTokenBucket) (p *SmuxPool) {
if cfg == nil {
cfg = gConfig
}
p = &SmuxPool{
target: target,
sessions: make([]*smux.Session, 0, cfg.ConnsPerAddr),
cfg: cfg,
streamBucket: streamBucket,
objects: make(chan *streamObject, cfg.PoolCapacity),
}
p.initSessions()
p.initStreams()
return p
}
func (p *SmuxPool) initSessions() {
p.sessionsLock.Lock()
defer p.sessionsLock.Unlock()
for i := 0; i < connPreAlloc; i++ {
conn, err := net.DialTimeout("tcp", p.target, p.cfg.DialTimeout)
if err != nil {
continue
}
sess, err := smux.Client(conn, p.cfg.Config)
if err != nil {
conn.Close()
continue
}
p.sessions = append(p.sessions, sess)
}
}
func (p *SmuxPool) initStreams() {
for i := 0; i < streamPreAlloc; i++ {
stream, err := p.NewStream()
if err == nil {
p.PutStreamObjectToPool(&streamObject{
stream: stream,
idle: time.Now().UnixNano(),
})
}
}
}
func (p *SmuxPool) callCreate() (createCall *createSessCall) {
createCall = p.loadCreateCall()
if createCall == nil {
goto tryCreateNewSess
}
select {
case <-createCall.notify:
if time.Now().UnixNano()-createCall.idle > defaultCreateInterval {
goto tryCreateNewSess
} else {
return
}
// default:
}
tryCreateNewSess:
prev := createCall
createCall = &createSessCall{
idle: time.Now().UnixNano(),
notify: make(chan struct{}),
}
if p.casCreateCall(prev, createCall) {
go p.handleCreateCall(createCall)
return createCall
} else {
return p.loadCreateCall()
}
}
func (p *SmuxPool) autoRelease() {
poolLen := len(p.objects)
getFromPool:
for i := 0; i < poolLen; i++ {
select {
case obj := <-p.objects:
if streamClosed(obj.stream) {
p.MarkClosed(obj.stream)
} else if time.Now().UnixNano()-obj.idle > p.cfg.StreamIdleTimeout {
obj.stream.Close()
p.MarkClosed(obj.stream)
} else {
p.PutStreamObjectToPool(obj)
}
default:
break getFromPool
}
}
p.sessionsLock.Lock()
defer p.sessionsLock.Unlock()
sessionsLen := len(p.sessions)
hole := 0
for i := 0; i+hole < sessionsLen; {
o := p.sessions[i]
if o.IsClosed() {
p.sessions[i] = nil
hole++
} else if o.NumStreams() == 0 {
o.Close()
p.sessions[i] = nil
hole++
} else {
i++
}
if hole > 0 && i+hole < sessionsLen {
p.sessions[i] = p.sessions[i+hole]
}
}
if hole > 0 {
p.sessions = p.sessions[:sessionsLen-hole]
}
}
func streamClosed(stream *smux.Stream) bool {
select {
case <-stream.GetDieCh():
return true
default:
return false
}
}
func (p *SmuxPool) canUse(sess *smux.Session) bool {
if sess == nil || sess.IsClosed() {
return false
}
streamNum := sess.NumStreams()
if streamNum > 0 {
if streamNum < p.cfg.StreamsPerConn {
return true
}
maxStreams := p.cfg.StreamsPerConn * p.cfg.ConnsPerAddr
inflight := p.inflightStreamNum()
if inflight >= maxStreams {
// oversold
return streamNum <= ((inflight / p.cfg.ConnsPerAddr) + 1)
} else {
return false
}
} else {
return true
}
}
func (p *SmuxPool) ReleaseAll() {
p.sessionsLock.Lock()
defer p.sessionsLock.Unlock()
sessionsLen := len(p.sessions)
for i := 0; i < sessionsLen; i++ {
o := p.sessions[i]
if o != nil {
o.Close()
p.sessions[i] = nil
}
}
p.sessions = p.sessions[:0]
}
func (p *SmuxPool) getAvailSess() (sess *smux.Session) {
// every time start from different pos
iter := atomic.AddInt64(&p.sessionsIter, 1) - 1
p.sessionsLock.RLock()
sessionsLen := len(p.sessions)
for i := 0; i < sessionsLen; i++ {
o := p.sessions[(int64(i)+iter)%int64(sessionsLen)]
if p.canUse(o) {
sess = o
break
}
}
p.sessionsLock.RUnlock()
return
}
func (p *SmuxPool) insertSession(sess *smux.Session) {
p.sessionsLock.Lock()
// replace
for i, o := range p.sessions {
if o == nil || o.IsClosed() {
p.sessions[i] = sess
p.sessionsLock.Unlock()
return
}
}
// or append
p.sessions = append(p.sessions, sess)
p.sessionsLock.Unlock()
}
func (p *SmuxPool) GetConnect() (*smux.Stream, error) {
poolLen := len(p.objects)
getFromPool:
for i := 0; i < poolLen; i++ {
select {
case obj := <-p.objects:
if obj != nil {
select {
case <-obj.stream.GetDieCh():
p.MarkClosed(obj.stream)
continue getFromPool
default:
return obj.stream, nil
}
}
default:
break getFromPool
}
}
return p.NewStream()
}
func (p *SmuxPool) NewStream() (stream *smux.Stream, err error) {
sess := p.getAvailSess()
if sess != nil {
stream, err = p.openStream(sess)
if err != nil {
goto createNewSession
} else {
return
}
}
createNewSession:
call := p.callCreate()
<-call.notify
if call.err != nil {
return nil, call.err
} else {
return p.openStream(call.sess)
}
}
func (p *SmuxPool) MarkClosed(s *smux.Stream) {
s.Close()
p.addInflightStream(-1)
p.streamBucket.returnTokens(1)
}
func (p *SmuxPool) addInflightStream(n int) int {
return int(atomic.AddInt64(&p.inflightStreams, int64(n)))
}
func (p *SmuxPool) inflightStreamNum() int {
return int(atomic.LoadInt64(&p.inflightStreams))
}
func (p *SmuxPool) loadCreateCall() *createSessCall {
return (*createSessCall)(atomic.LoadPointer((*unsafe.Pointer)(unsafe.Pointer(&p.createSessCall))))
}
func (p *SmuxPool) casCreateCall(prev *createSessCall, new *createSessCall) bool {
return atomic.CompareAndSwapPointer((*unsafe.Pointer)(unsafe.Pointer(&p.createSessCall)),
unsafe.Pointer(prev), unsafe.Pointer(new))
}
func (p *SmuxPool) handleCreateCall(call *createSessCall) {
var conn net.Conn
defer close(call.notify)
conn, call.err = net.DialTimeout("tcp", p.target, p.cfg.DialTimeout)
if call.err != nil {
return
}
c := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
call.sess, call.err = smux.Client(conn, p.cfg.Config)
if call.err != nil {
c.Close()
return
}
p.insertSession(call.sess)
}
func (p *SmuxPool) openStream(sess *smux.Session) (stream *smux.Stream, err error) {
if !p.streamBucket.consumeTokens(1) {
return nil, ErrTooMuchSmuxStreams
}
stream, err = sess.OpenStream()
if err == nil {
p.addInflightStream(1)
} else {
p.streamBucket.returnTokens(1)
}
return
}
func (p *SmuxPool) PutStreamObjectToPool(obj *streamObject) {
if streamClosed(obj.stream) {
p.MarkClosed(obj.stream)
return
}
select {
case p.objects <- obj:
return
default:
p.MarkClosed(obj.stream)
}
}
func (p *SmuxPool) GetStat() *SmuxPoolStat {
stat := &SmuxPoolStat{
Addr: p.target,
InflightStreams: 0,
InflightStreamsReported: 0,
TotalSessions: 0,
StreamsPerSession: make(map[string]int, p.cfg.ConnsPerAddr),
}
p.sessionsLock.RLock()
stat.TotalSessions = len(p.sessions)
stat.InflightStreamsReported = p.inflightStreamNum()
for _, sess := range p.sessions {
streams := sess.NumStreams()
stat.InflightStreams += streams
stat.StreamsPerSession[sess.LocalAddr().String()] += streams
}
p.sessionsLock.RUnlock()
return stat
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stat
import (
"bufio"
"errors"
"fmt"
"io/ioutil"
"os"
"path"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
Stat_Module = "mem_stat"
FileNameDateFormat = "20060102150405"
ShiftedExtension = ".old"
PRO_MEM = "/proc/%d/status"
F_OK = 0
MaxTimeoutLevel = 3
DefaultStatLogSize = 200 * 1024 * 1024 // 200M
DefaultHeadRoom = 50 * 1024 // 50G
MaxReservedDays = 7 * 24 * time.Hour
)
var DefaultTimeOutUs = [MaxTimeoutLevel]uint32{100000, 500000, 1000000}
var DefaultStatInterval = 60 * time.Second // 60 seconds
var re = regexp.MustCompile(`\([0-9]*\)`)
type ShiftedFile []os.FileInfo
func (f ShiftedFile) Less(i, j int) bool {
return f[i].ModTime().Before(f[j].ModTime())
}
func (f ShiftedFile) Len() int {
return len(f)
}
func (f ShiftedFile) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
type typeInfo struct {
typeName string
allCount uint32
failCount uint32
maxTime time.Duration
minTime time.Duration
allTimeUs time.Duration
timeOut [MaxTimeoutLevel]uint32
}
type Statistic struct {
logDir string
logMaxSize int64
logBaseName string
pid int
lastClearTime time.Time
timeOutUs [MaxTimeoutLevel]uint32
typeInfoMap map[string]*typeInfo
closeStat bool
useMutex bool
sync.Mutex
}
var gSt *Statistic = nil
func NewStatistic(dir, logModule string, logMaxSize int64, timeOutUs [MaxTimeoutLevel]uint32, useMutex bool) (*Statistic, error) {
dir = path.Join(dir, logModule)
fi, err := os.Stat(dir)
if err != nil {
os.MkdirAll(dir, 0o755)
} else {
if !fi.IsDir() {
return nil, errors.New(dir + " is not a directory")
}
}
_ = os.Chmod(dir, 0o755)
logName := path.Join(dir, Stat_Module)
st := &Statistic{
logDir: dir,
logMaxSize: logMaxSize,
logBaseName: logName,
pid: os.Getpid(),
lastClearTime: time.Time{},
timeOutUs: timeOutUs,
typeInfoMap: make(map[string]*typeInfo),
closeStat: false,
useMutex: useMutex,
Mutex: sync.Mutex{},
}
gSt = st
go st.flushScheduler()
return st, nil
}
// TODO: how to close?
func (st *Statistic) flushScheduler() {
timer := time.NewTimer(DefaultStatInterval)
defer timer.Stop()
for {
<-timer.C
err := WriteStat()
if err != nil {
log.LogErrorf("WriteStat error: %v", err)
}
timer.Reset(DefaultStatInterval)
fs := syscall.Statfs_t{}
if err := syscall.Statfs(st.logDir, &fs); err != nil {
log.LogErrorf("Get fs stat failed, err: %v", err)
continue
}
diskSpaceLeft := int64(fs.Bavail * uint64(fs.Bsize))
diskSpaceLeft -= DefaultHeadRoom * 1024 * 1024
removeLogFile(diskSpaceLeft, Stat_Module)
}
}
func removeLogFile(diskSpaceLeft int64, module string) {
fInfos, err := ioutil.ReadDir(gSt.logDir)
if err != nil {
log.LogErrorf("ReadDir failed, logDir: %s, err: %v", gSt.logDir, err)
return
}
var needDelFiles ShiftedFile
for _, info := range fInfos {
if deleteFileFilter(info, diskSpaceLeft, module) {
needDelFiles = append(needDelFiles, info)
}
}
sort.Sort(needDelFiles)
for _, info := range needDelFiles {
if err = os.Remove(path.Join(gSt.logDir, info.Name())); err != nil {
log.LogErrorf("Remove log file failed, logFileName: %s, err: %v", info.Name(), err)
continue
}
diskSpaceLeft += info.Size()
if diskSpaceLeft > 0 && time.Since(info.ModTime()) < MaxReservedDays {
break
}
}
}
func deleteFileFilter(info os.FileInfo, diskSpaceLeft int64, module string) bool {
if diskSpaceLeft <= 0 {
return info.Mode().IsRegular() && strings.HasSuffix(info.Name(), ShiftedExtension) && strings.HasPrefix(info.Name(), module)
}
return time.Since(info.ModTime()) > MaxReservedDays && strings.HasSuffix(info.Name(), ShiftedExtension) && strings.HasPrefix(info.Name(), module)
}
func CloseStat() {
if gSt == nil {
return
}
gSt.closeStat = true
}
func BeginStat() (bgTime *time.Time) {
bg := time.Now()
return &bg
}
func EndStat(typeName string, err error, bgTime *time.Time, statCount uint32) error {
if gSt == nil {
return nil
}
if gSt.closeStat {
return nil
}
if gSt.useMutex {
gSt.Lock()
defer gSt.Unlock()
}
if err != nil {
newErrStr := string(re.ReplaceAll([]byte(err.Error()), []byte("(xxx)")))
baseLen := len(typeName) + 2
if len(newErrStr)+baseLen > 41 {
typeName = typeName + "[" + newErrStr[:41-baseLen] + "]"
} else {
typeName = typeName + "[" + newErrStr + "]"
}
}
return addStat(typeName, err, bgTime, statCount)
}
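// exampleEndStat is an illustrative sketch, not part of the original source,
// of the intended measurement pattern: record the begin time, run the
// operation, and report the result under a type name when it finishes. The
// type name "ExampleOp" is an assumption for demonstration only.
func exampleEndStat() (err error) {
bgTime := BeginStat()
defer func() {
EndStat("ExampleOp", err, bgTime, 1)
}()
// ... the operation being measured ...
return nil
}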
func WriteStat() error {
if gSt == nil {
return nil
}
if gSt.useMutex {
gSt.Lock()
defer gSt.Unlock()
}
logFileName := gSt.logBaseName + ".log"
statFile, err := os.OpenFile(logFileName, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0o666)
if err != nil {
log.LogErrorf("OpenLogFile failed, logFileName: %s, err: %v\n", logFileName, err)
return fmt.Errorf("OpenLogFile failed, logFileName %s\n", logFileName)
}
defer statFile.Close()
statSpan := time.Since(gSt.lastClearTime) / 1e9
ioStream := bufio.NewWriter(statFile)
defer ioStream.Flush()
fmt.Fprintf(ioStream, "\n=============== Statistic in %ds, %s =====================\n",
statSpan, time.Now().Format("2006-01-02 15:04:05"))
if virt, res, err := GetProcessMemory(gSt.pid); err != nil {
log.LogErrorf("Get process memory failed, err: %v", err)
fmt.Fprintf(ioStream, "Get Mem Failed.\n")
} else {
fmt.Fprintf(ioStream, "Mem Allocated(kB): VIRT %-10d RES %-10d\n", virt, res)
}
fmt.Fprintf(ioStream, "%-42s|%10s|%8s|%8s|%8s|%8s|%8s|%8s|%8s|\n",
"", "TOTAL", "FAILED", "AVG(ms)", "MAX(ms)", "MIN(ms)",
">"+strconv.Itoa(int(gSt.timeOutUs[0])/1000)+"ms",
">"+strconv.Itoa(int(gSt.timeOutUs[1])/1000)+"ms",
">"+strconv.Itoa(int(gSt.timeOutUs[2])/1000)+"ms")
typeNames := make([]string, 0)
for typeName := range gSt.typeInfoMap {
typeNames = append(typeNames, typeName)
}
sort.Strings(typeNames)
for _, typeName := range typeNames {
typeInfo := gSt.typeInfoMap[typeName]
avgUs := int32(0)
if typeInfo.allCount > 0 {
avgUs = int32(typeInfo.allTimeUs / time.Duration(typeInfo.allCount))
}
fmt.Fprintf(ioStream, "%-42s|%10d|%8d|%8.2f|%8.2f|%8.2f|%8d|%8d|%8d|\n",
typeInfo.typeName, typeInfo.allCount, typeInfo.failCount,
float32(avgUs)/1000, float32(typeInfo.maxTime)/1000, float32(typeInfo.minTime)/1000,
typeInfo.timeOut[0], typeInfo.timeOut[1], typeInfo.timeOut[2])
}
fmt.Fprintf(ioStream, "-------------------------------------------------------------------"+
"--------------------------------------------------\n")
// clear stat
gSt.lastClearTime = time.Now()
gSt.typeInfoMap = make(map[string]*typeInfo)
shiftFiles()
return nil
}
func ClearStat() {
if gSt == nil {
return
}
if gSt.useMutex {
gSt.Lock()
defer gSt.Unlock()
}
gSt.lastClearTime = time.Now()
gSt.typeInfoMap = make(map[string]*typeInfo)
}
func addStat(typeName string, err error, bgTime *time.Time, statCount uint32) error {
if gSt == nil {
return nil
}
if len(typeName) == 0 {
return fmt.Errorf("AddStat fail, typeName %s\n", typeName)
}
if typeInfo, ok := gSt.typeInfoMap[typeName]; ok {
typeInfo.allCount += statCount
if err != nil {
typeInfo.failCount += statCount
}
addTime(typeInfo, bgTime)
return nil
}
typeInfo := &typeInfo{
typeName: typeName,
allCount: statCount,
failCount: 0,
maxTime: 0,
minTime: 0,
allTimeUs: 0,
timeOut: [3]uint32{},
}
if err != nil {
typeInfo.failCount = statCount
}
gSt.typeInfoMap[typeName] = typeInfo
addTime(typeInfo, bgTime)
return nil
}
func addTime(typeInfo *typeInfo, bgTime *time.Time) {
if bgTime == nil {
return
}
timeCostUs := time.Since(*bgTime) / 1e3
if timeCostUs == 0 {
return
}
if timeCostUs >= time.Duration(gSt.timeOutUs[0]) && timeCostUs < time.Duration(gSt.timeOutUs[1]) {
typeInfo.timeOut[0]++
} else if timeCostUs >= time.Duration(gSt.timeOutUs[1]) && timeCostUs < time.Duration(gSt.timeOutUs[2]) {
typeInfo.timeOut[1]++
} else if timeCostUs > time.Duration(gSt.timeOutUs[2]) {
typeInfo.timeOut[2]++
}
if timeCostUs > typeInfo.maxTime {
typeInfo.maxTime = timeCostUs
}
if typeInfo.minTime == 0 || timeCostUs < typeInfo.minTime {
typeInfo.minTime = timeCostUs
}
typeInfo.allTimeUs += timeCostUs
}
func shiftFiles() error {
logFileName := gSt.logBaseName + ".log"
fileInfo, err := os.Stat(logFileName)
if err != nil {
return err
}
if fileInfo.Size() < gSt.logMaxSize {
return nil
}
if syscall.Access(logFileName, F_OK) == nil {
logNewFileName := logFileName + "." + time.Now().Format(
FileNameDateFormat) + ShiftedExtension
if err = syscall.Rename(logFileName, logNewFileName); err != nil {
log.LogErrorf("RenameFile failed, logFileName: %s, logNewFileName: %s, err: %v\n",
logFileName, logNewFileName, err)
return fmt.Errorf("RenameFile failed, logFileName %s, logNewFileName %s\n",
logFileName, logNewFileName)
}
}
return nil
}
func StatBandWidth(typeName string, Size uint32) {
EndStat(typeName+"[FLOW_KB]", nil, nil, Size/1024)
}
func GetMememory() (Virt, Res uint64, err error) {
return GetProcessMemory(gSt.pid)
}
func GetProcessMemory(pid int) (Virt, Res uint64, err error) {
proFileName := fmt.Sprintf(PRO_MEM, pid)
fp, err := os.Open(proFileName)
if err != nil {
return
}
defer fp.Close()
scan := bufio.NewScanner(fp)
for scan.Scan() {
line := scan.Text()
fields := strings.Split(line, ":")
key := fields[0]
if key == "VmRSS" {
value := strings.TrimSpace(fields[1])
value = strings.Replace(value, " kB", "", -1)
Res, err = strconv.ParseUint(value, 10, 64)
if err != nil {
return
}
} else if key == "VmSize" {
value := strings.TrimSpace(fields[1])
value = strings.Replace(value, " kB", "", -1)
Virt, err = strconv.ParseUint(value, 10, 64)
if err != nil {
return
}
} else {
continue
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"crypto/rand"
"fmt"
"math/big"
"strconv"
"strings"
)
func SubString(sourceString string, begin, end int) string {
bytes := []byte(sourceString)
stringLength := len(bytes)
if begin < 0 {
begin = 0
}
if end > stringLength {
end = stringLength
}
return string(bytes[begin:end])
}
type RandomSeed byte
func (s RandomSeed) Runes() []rune {
sourceBuilder := strings.Builder{}
if s&Numeric > 0 {
sourceBuilder.WriteString("0123456789")
}
if s&LowerLetter > 0 {
sourceBuilder.WriteString("abcdefghijklmnopqrstuvwxyz")
}
if s&UpperLetter > 0 {
sourceBuilder.WriteString("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
}
return []rune(sourceBuilder.String())
}
const (
Numeric RandomSeed = 1 << iota
LowerLetter
UpperLetter
)
func RandomString(length int, seed RandomSeed) string {
runs := seed.Runes()
result := ""
for i := 0; i < length; i++ {
lenInt64 := int64(len(runs))
randNumber, _ := rand.Int(rand.Reader, big.NewInt(lenInt64))
result += string(runs[randNumber.Uint64()])
}
return result
}
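// exampleRandomString is an illustrative sketch, not part of the original
// source: RandomSeed values can be combined to control the character set of
// the generated string.
func exampleRandomString() {
token := RandomString(16, Numeric|LowerLetter|UpperLetter)
fmt.Println(len(token)) // 16
}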
// Any2String formats any value as a string.
func Any2String(value interface{}) string {
var val string
switch v := value.(type) {
case string:
val = v
case *string:
val = *v
case bool:
val = strconv.FormatBool(v)
case *bool:
val = strconv.FormatBool(*v)
case int:
val = strconv.FormatInt(int64(v), 10)
case int8:
val = strconv.FormatInt(int64(v), 10)
case int16:
val = strconv.FormatInt(int64(v), 10)
case int32:
val = strconv.FormatInt(int64(v), 10)
case int64:
val = strconv.FormatInt(int64(v), 10)
case *int:
val = strconv.FormatInt(int64(*v), 10)
case *int8:
val = strconv.FormatInt(int64(*v), 10)
case *int16:
val = strconv.FormatInt(int64(*v), 10)
case *int32:
val = strconv.FormatInt(int64(*v), 10)
case *int64:
val = strconv.FormatInt(int64(*v), 10)
case uint:
val = strconv.FormatUint(uint64(v), 10)
case uint8:
val = strconv.FormatUint(uint64(v), 10)
case uint16:
val = strconv.FormatUint(uint64(v), 10)
case uint32:
val = strconv.FormatUint(uint64(v), 10)
case uint64:
val = strconv.FormatUint(uint64(v), 10)
case *uint:
val = strconv.FormatUint(uint64(*v), 10)
case *uint8:
val = strconv.FormatUint(uint64(*v), 10)
case *uint16:
val = strconv.FormatUint(uint64(*v), 10)
case *uint32:
val = strconv.FormatUint(uint64(*v), 10)
case *uint64:
val = strconv.FormatUint(uint64(*v), 10)
case float32:
val = strconv.FormatFloat(float64(v), 'f', 6, 64)
case float64:
val = strconv.FormatFloat(float64(v), 'f', 6, 64)
case *float32:
val = strconv.FormatFloat(float64(*v), 'f', 6, 64)
case *float64:
val = strconv.FormatFloat(float64(*v), 'f', 6, 64)
case complex64:
val = strconv.FormatComplex(complex128(v), 'f', 6, 64)
case complex128:
val = strconv.FormatComplex(complex128(v), 'f', 6, 64)
case *complex64:
val = strconv.FormatComplex(complex128(*v), 'f', 6, 64)
case *complex128:
val = strconv.FormatComplex(complex128(*v), 'f', 6, 64)
default:
val = fmt.Sprintf("%v", value)
}
return val
}
// String2Any parses the string into the value pointed to by pvalue.
func String2Any(str string, pvalue interface{}) error {
var val interface{}
var err error
switch v := pvalue.(type) {
case *string:
val = str
case *bool:
val, err = strconv.ParseBool(str)
case *int:
val, err = strconv.ParseInt(str, 10, 0)
case *int8:
val, err = strconv.ParseInt(str, 10, 8)
case *int16:
val, err = strconv.ParseInt(str, 10, 16)
case *int32:
val, err = strconv.ParseInt(str, 10, 32)
case *int64:
val, err = strconv.ParseInt(str, 10, 64)
case *uint:
val, err = strconv.ParseUint(str, 10, 0)
case *uint8:
val, err = strconv.ParseUint(str, 10, 8)
case *uint16:
val, err = strconv.ParseUint(str, 10, 16)
case *uint32:
val, err = strconv.ParseUint(str, 10, 32)
case *uint64:
val, err = strconv.ParseUint(str, 10, 64)
case *float32:
val, err = strconv.ParseFloat(str, 32)
case *float64:
val, err = strconv.ParseFloat(str, 64)
case *complex64:
val, err = strconv.ParseComplex(str, 64)
case *complex128:
val, err = strconv.ParseComplex(str, 128)
default:
return fmt.Errorf("unknown type %v of %s %v", v, str, pvalue)
}
if err != nil {
return err
}
switch v := pvalue.(type) {
case *string:
*v = val.(string)
case *bool:
*v = val.(bool)
case *int:
*v = int(val.(int64))
case *int8:
*v = int8(val.(int64))
case *int16:
*v = int16(val.(int64))
case *int32:
*v = int32(val.(int64))
case *int64:
*v = int64(val.(int64))
case *uint:
*v = uint(val.(uint64))
case *uint8:
*v = uint8(val.(uint64))
case *uint16:
*v = uint16(val.(uint64))
case *uint32:
*v = uint32(val.(uint64))
case *uint64:
*v = uint64(val.(uint64))
case *float32:
*v = float32(val.(float64))
case *float64:
*v = float64(val.(float64))
case *complex64:
*v = complex64(val.(complex128))
case *complex128:
*v = complex128(val.(complex128))
}
return nil
}
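// exampleString2Any is an illustrative sketch, not part of the original
// source: Any2String and String2Any round-trip a numeric value through its
// string form.
func exampleString2Any() {
s := Any2String(uint32(512)) // "512"
var v uint32
if err := String2Any(s, &v); err == nil {
fmt.Println(v) // 512
}
}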
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package synclist
import (
"container/list"
"sync"
)
type SyncList struct {
list.List
mu sync.RWMutex
}
func New() *SyncList {
l := new(SyncList)
l.Init()
return l
}
func (l *SyncList) Init() *SyncList {
l.mu.Lock()
l.List.Init()
l.mu.Unlock()
return l
}
func (l *SyncList) Remove(e *list.Element) interface{} {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.Remove(e)
}
func (l *SyncList) PushFront(v interface{}) *list.Element {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.PushFront(v)
}
func (l *SyncList) Back() *list.Element {
l.mu.RLock()
defer l.mu.RUnlock()
return l.List.Back()
}
func (l *SyncList) PushBack(v interface{}) *list.Element {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.PushBack(v)
}
func (l *SyncList) InsertBefore(v interface{}, mark *list.Element) *list.Element {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.InsertBefore(v, mark)
}
func (l *SyncList) InsertAfter(v interface{}, mark *list.Element) *list.Element {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.InsertAfter(v, mark)
}
func (l *SyncList) Len() int {
l.mu.RLock()
defer l.mu.RUnlock()
return l.List.Len()
}
func (l *SyncList) Front() *list.Element {
l.mu.RLock()
defer l.mu.RUnlock()
return l.List.Front()
}
func (l *SyncList) MoveToFront(e *list.Element) {
l.mu.Lock()
l.List.MoveToFront(e)
l.mu.Unlock()
}
func (l *SyncList) MoveToBack(e *list.Element) {
l.mu.Lock()
l.List.MoveToBack(e)
l.mu.Unlock()
}
func (l *SyncList) MoveBefore(e, mark *list.Element) {
l.mu.Lock()
l.List.MoveBefore(e, mark)
l.mu.Unlock()
}
func (l *SyncList) MoveAfter(e, mark *list.Element) {
l.mu.Lock()
l.List.MoveAfter(e, mark)
l.mu.Unlock()
}
func (l *SyncList) PushBackList(other *SyncList) {
l.mu.Lock()
l.List.PushBackList(&other.List)
l.mu.Unlock()
}
func (l *SyncList) PushFrontList(other *SyncList) {
l.mu.Lock()
l.List.PushFrontList(&other.List)
l.mu.Unlock()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package timeutil
import (
"sync/atomic"
"time"
)
// GetCurrentTime returns the current time.
func GetCurrentTime() time.Time {
return now.time.Load().(time.Time)
}
// GetCurrentTimeUnix returns the current time unix.
func GetCurrentTimeUnix() int64 {
return atomic.LoadInt64(&now.timeUnix)
}
var now = newNowTime()
// nowTime defines the current time.
type nowTime struct {
time atomic.Value // store time.Time
timeUnix int64
}
// newNowTime returns a new nowTime.
func newNowTime() *nowTime {
n := time.Now()
t := &nowTime{timeUnix: n.Unix()}
t.time.Store(n)
go func() {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
<-ticker.C
n := time.Now()
t.time.Store(n)
atomic.StoreInt64(&t.timeUnix, n.Unix())
}
}()
return t
}
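// exampleCurrentTime is an illustrative sketch, not part of the original
// source: hot paths read the second-granularity cached clock instead of
// calling time.Now for every operation.
func exampleCurrentTime() (sec int64, wall time.Time) {
sec = GetCurrentTimeUnix() // cheap atomic load, refreshed once per second
wall = GetCurrentTime()    // cached time.Time value
return
}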
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package ump
import (
"os"
"strconv"
"sync"
"time"
)
type TpObject struct {
StartTime time.Time
EndTime time.Time
UmpType interface{}
}
func NewTpObject() (o *TpObject) {
o = new(TpObject)
o.StartTime = time.Now()
return
}
const (
TpMethod = "TP"
HeartbeatMethod = "Heartbeat"
FunctionError = "FunctionError"
)
var (
HostName string
LogTimeForMat = "20060102150405000"
AlarmPool = &sync.Pool{New: func() interface{} {
return new(BusinessAlarm)
}}
TpObjectPool = &sync.Pool{New: func() interface{} {
return new(TpObject)
}}
SystemAlivePool = &sync.Pool{New: func() interface{} {
return new(SystemAlive)
}}
FunctionTpPool = &sync.Pool{New: func() interface{} {
return new(FunctionTp)
}}
enableUmp = true
)
func InitUmp(module, dataDir string) (err error) {
if _, err = os.Stat(dataDir); err != nil {
enableUmp = false
err = nil
return
}
if err = initLogName(module, dataDir); err != nil {
enableUmp = false
err = nil
return
}
backGroudWrite()
return nil
}
func BeforeTP(key string) (o *TpObject) {
if !enableUmp {
return
}
o = TpObjectPool.Get().(*TpObject)
o.StartTime = time.Now()
tp := FunctionTpPool.Get().(*FunctionTp)
tp.HostName = HostName
tp.Time = time.Now().Format(LogTimeForMat)
tp.Key = key
tp.ProcessState = "0"
o.UmpType = tp
return
}
func AfterTP(o *TpObject, err error) {
if !enableUmp {
return
}
tp := o.UmpType.(*FunctionTp)
tp.ElapsedTime = strconv.FormatInt((int64)(time.Since(o.StartTime)/1e6), 10)
TpObjectPool.Put(o)
tp.ProcessState = "0"
if err != nil {
tp.ProcessState = "1"
}
select {
case FunctionTpLogWrite.logCh <- tp:
default:
}
}
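// exampleTP is an illustrative sketch, not part of the original source, of the
// tracking-point pattern: BeforeTP takes a TpObject from the pool and AfterTP
// reports the elapsed time and whether the call failed. The key "example_op"
// is an assumption for demonstration only.
func exampleTP(doWork func() error) {
tpObject := BeforeTP("example_op")
err := doWork()
AfterTP(tpObject, err)
}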
func Alive(key string) {
if !enableUmp {
return
}
alive := SystemAlivePool.Get().(*SystemAlive)
alive.HostName = HostName
alive.Key = key
alive.Time = time.Now().Format(LogTimeForMat)
select {
case SystemAliveLogWrite.logCh <- alive:
default:
}
}
func Alarm(key, detail string) {
if !enableUmp {
return
}
alarm := AlarmPool.Get().(*BusinessAlarm)
alarm.Time = time.Now().Format(LogTimeForMat)
alarm.Key = key
alarm.HostName = HostName
alarm.BusinessType = "0"
alarm.Value = "0"
alarm.Detail = detail
if len(alarm.Detail) > 512 {
rs := []rune(detail)
alarm.Detail = string(rs[0:510])
}
select {
case BusinessAlarmLogWrite.logCh <- alarm:
default:
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package ump
import (
"bytes"
"encoding/json"
"fmt"
"net"
"os"
"strings"
)
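// FunctionTp is the per-call latency record written to the tp log.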
type FunctionTp struct {
Time string `json:"time"`
Key string `json:"key"`
HostName string `json:"hostname"`
ProcessState string `json:"processState"`
ElapsedTime string `json:"elapsedTime"`
}
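// SystemAlive is the heartbeat record written to the alive log.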
type SystemAlive struct {
Key string `json:"key"`
HostName string `json:"hostname"`
Time string `json:"time"`
}
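// BusinessAlarm is the alarm record written to the business log.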
type BusinessAlarm struct {
Time string `json:"time"`
Key string `json:"key"`
HostName string `json:"hostname"`
BusinessType string `json:"type"`
Value string `json:"value"`
Detail string `json:"detail"`
}
const (
FunctionTpSufixx = "tp.log"
SystemAliveSufixx = "alive.log"
BusinessAlarmSufixx = "business.log"
LogFileOpt = os.O_RDWR | os.O_CREATE | os.O_APPEND
ChSize = 102400
BusinessAlarmType = "BusinessAlarm"
SystemAliveType = "SystemAlive"
FunctionTpType = "FunctionTp"
HostNameFile = "/proc/sys/kernel/hostname"
MaxLogSize = 1024 * 1024 * 10
)
var (
FunctionTpLogWrite = &LogWrite{logCh: make(chan interface{}, ChSize)}
SystemAliveLogWrite = &LogWrite{logCh: make(chan interface{}, ChSize)}
BusinessAlarmLogWrite = &LogWrite{logCh: make(chan interface{}, ChSize)}
UmpDataDir = "/export/home/tomcat/UMP-Monitor/logs/"
)
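// LogWrite drains one record channel, JSON-encodes the records and appends
// them to a dedicated log file, rotating the file once it exceeds MaxLogSize.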
type LogWrite struct {
logCh chan interface{}
logName string
logSize int64
seq int
logSufixx string
logFp *os.File
sigCh chan bool
bf *bytes.Buffer
jsonEncoder *json.Encoder
}
func (lw *LogWrite) initLogFp(sufixx string) (err error) {
var fi os.FileInfo
lw.seq = 0
lw.sigCh = make(chan bool, 1)
lw.logSufixx = sufixx
lw.logName = fmt.Sprintf("%s%s%s", UmpDataDir, "ump_", lw.logSufixx)
lw.bf = bytes.NewBuffer([]byte{})
lw.jsonEncoder = json.NewEncoder(lw.bf)
lw.jsonEncoder.SetEscapeHTML(false)
if lw.logFp, err = os.OpenFile(lw.logName, LogFileOpt, 0o666); err != nil {
return
}
if fi, err = lw.logFp.Stat(); err != nil {
return
}
lw.logSize = fi.Size()
return
}
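// backGroundCheckFile rotates the log file when it exceeds MaxLogSize,
// keeping at most three rotated files (suffixes .1 to .3) and reopening an
// empty file under the original name.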
func (lw *LogWrite) backGroundCheckFile() (err error) {
if lw.logSize <= MaxLogSize {
return
}
lw.logFp.Close()
lw.seq++
if lw.seq > 3 {
lw.seq = 1
}
name := fmt.Sprintf("%s%s%s.%d", UmpDataDir, "ump_", lw.logSufixx, lw.seq)
if _, err = os.Stat(name); err == nil {
os.Remove(name)
}
os.Rename(lw.logName, name)
if lw.logFp, err = os.OpenFile(lw.logName, LogFileOpt, 0o666); err != nil {
lw.seq--
return
}
if err = os.Truncate(lw.logName, 0); err != nil {
lw.seq--
return
}
lw.logSize = 0
return
}
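// backGroundWrite loops forever: it takes a record from the channel, encodes
// it as JSON according to umpType, returns it to its pool and appends the
// encoded bytes to the log file.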
func (lw *LogWrite) backGroundWrite(umpType string) {
for {
var body []byte
obj := <-lw.logCh
switch umpType {
case FunctionTpType:
tp := obj.(*FunctionTp)
lw.jsonEncoder.Encode(tp)
body = append(body, lw.bf.Bytes()...)
lw.bf.Reset()
FunctionTpPool.Put(tp)
case SystemAliveType:
alive := obj.(*SystemAlive)
lw.jsonEncoder.Encode(alive)
body = append(body, lw.bf.Bytes()...)
lw.bf.Reset()
SystemAlivePool.Put(alive)
case BusinessAlarmType:
alarm := obj.(*BusinessAlarm)
lw.jsonEncoder.Encode(alarm)
body = append(body, lw.bf.Bytes()...)
lw.bf.Reset()
AlarmPool.Put(alarm)
default:
// do nothing
}
if lw.backGroundCheckFile() != nil {
continue
}
body = append(body, []byte("\n")...)
lw.logFp.Write(body)
lw.logSize += (int64)(len(body))
}
}
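// initLogName resolves the ump data directory and the host identity, then
// opens the tp, alive and alarm log files for the given module.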
func initLogName(module, dataDir string) (err error) {
if dataDir != "" {
UmpDataDir = dataDir
if !strings.HasSuffix(UmpDataDir, "/") {
UmpDataDir += "/"
}
} else {
return fmt.Errorf("warnLogDir dir not config")
}
if err = os.MkdirAll(UmpDataDir, 0o755); err != nil {
return
}
if HostName, err = GetLocalIpAddr(); err != nil {
return
}
if err = FunctionTpLogWrite.initLogFp(module + "_" + FunctionTpSufixx); err != nil {
return
}
if err = SystemAliveLogWrite.initLogFp(module + "_" + SystemAliveSufixx); err != nil {
return
}
if err = BusinessAlarmLogWrite.initLogFp(module + "_" + BusinessAlarmSufixx); err != nil {
return
}
return
}
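// GetLocalIpAddr returns the first non-loopback IPv4 address of the host, or
// an error if none is found.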
func GetLocalIpAddr() (localAddr string, err error) {
addrs, err := net.InterfaceAddrs()
if err != nil {
return
}
for _, addr := range addrs {
if ipNet, isIpNet := addr.(*net.IPNet); isIpNet && !ipNet.IP.IsLoopback() {
if ipv4 := ipNet.IP.To4(); ipv4 != nil {
localAddr = ipv4.String()
return
}
}
}
err = fmt.Errorf("cannot get local ip")
return
}
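// backGroudWrite starts one background writer goroutine per record type.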
func backGroudWrite() {
go FunctionTpLogWrite.backGroundWrite(FunctionTpType)
go SystemAliveLogWrite.backGroundWrite(SystemAliveType)
go BusinessAlarmLogWrite.backGroundWrite(BusinessAlarmType)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"crypto/md5"
"encoding/hex"
"fmt"
"net"
"regexp"
"strings"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
"github.com/cubefs/cubefs/util/log"
)
const (
_ = iota
KB = 1 << (10 * iota)
MB
GB
TB
PB
DefaultDataPartitionSize = 120 * GB
TaskWorkerInterval = 1
)
const (
BlockCount = 1024
BlockSize = 65536 * 2
ReadBlockSize = BlockSize
PerBlockCrcSize = 4
ExtentSize = BlockCount * BlockSize
PacketHeaderSize = 57
BlockHeaderSize = 4096
SyscallTryMaxTimes = 3
PacketHeaderVerSize = 65
)
const (
PageSize = 4 * util.KB
FallocFLKeepSize = 1
FallocFLPunchHole = 2
)
const (
AclListIP = 0
AclAddIP = 1
AclDelIP = 2
AclCheckIP = 3
)
const (
UidLimitList = 0
UidAddLimit = 1
UidDel = 2
UidGetLimit = 3
)
const (
DefaultTinySizeLimit = 1 * MB // TODO explain tiny extent?
)
type MultiVersionSeq uint64
func Min(a, b int) int {
if a > b {
return b
}
return a
}
func Max(a, b int) int {
if a > b {
return a
}
return b
}
// IsIPV4 reports whether val is an IPv4 address.
func IsIPV4(val interface{}) bool {
ip4Pattern := `((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)`
ip4 := regexpCompile(ip4Pattern)
return isMatch(ip4, val)
}
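// GetIp returns the ip part of an "ip:port" address, or the empty string if
// addr contains no port separator.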
func GetIp(addr string) (ip string) {
var arr []string
if arr = strings.Split(addr, ":"); len(arr) < 2 {
return
}
ip = strings.Trim(arr[0], " ")
return ip
}
func getIpAndPort(ipAddr string) (ip string, port string, success bool) {
success = false
arr := strings.Split(ipAddr, ":")
if len(arr) != 2 {
log.LogWarnf("action[GetIpAndPort] ipAddr[%v] invalid", ipAddr)
return
}
ip = strings.Trim(arr[0], " ")
port = strings.Trim(arr[1], " ")
success = true
return
}
func getDomainAndPort(domainAddr string) (domain string, port string, success bool) {
success = false
arr := strings.Split(domainAddr, ":")
if len(arr) != 2 {
log.LogWarnf("action[GetDomainAndPort] domainAddr[%v] invalid", domainAddr)
return
}
domain = strings.Trim(arr[0], " ")
port = strings.Trim(arr[1], " ")
success = true
return
}
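// IsIPV4Addr reports whether ipAddr is of the form "ipv4:port".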
func IsIPV4Addr(ipAddr string) bool {
ip, _, ok := getIpAndPort(ipAddr)
if !ok {
return false
}
return IsIPV4(ip)
}
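// ParseIpAddrToDomainAddr reverse-resolves the ip in an "ip:port" address and
// returns a comma-separated list of "domain:port" entries, or the empty
// string if the lookup fails.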
func ParseIpAddrToDomainAddr(ipAddr string) (domainAddr string) {
ip, port, ok := getIpAndPort(ipAddr)
if !ok {
return
}
domains, err := net.LookupAddr(ip)
if err != nil {
log.LogWarnf("action[ParseIpAddrToDomainAddr] failed, ipAddr[%v], ip[%v], err[%v]", ipAddr, ip, err)
return
}
for _, v := range domains {
domain := v
if domain[len(domain)-1] == '.' {
domain = domain[0 : len(domain)-1]
}
if len(domainAddr) != 0 {
domainAddr += ","
}
domainAddr += fmt.Sprintf("%s:%v", domain, port)
}
return
}
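// ParseAddrToIpAddr normalizes addr to "ip:port" form, resolving the host
// part when it is a domain name.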
func ParseAddrToIpAddr(addr string) (ipAddr string, success bool) {
success = true
if IsIPV4Addr(addr) {
ipAddr = addr
return
}
if parsedAddr, ok := ParseDomainAddrToIpAddr(addr); ok {
ipAddr = parsedAddr
return
}
success = false
return
}
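// ParseDomainAddrToIpAddr resolves a "domain:port" address to "ip:port"; it
// fails if the domain resolves to more than one distinct IP.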
func ParseDomainAddrToIpAddr(domainAddr string) (ipAddr string, success bool) {
success = false
domain, port, ok := getDomainAndPort(domainAddr)
if !ok {
return
}
ips, err := net.LookupHost(domain)
if err != nil {
log.LogWarnf("action[ParseDomainAddrToIpAddr] failed, domainAddr[%v], domain[%v], err[%v]",
domainAddr, domain, err)
return
}
if len(ips) == 0 {
log.LogWarnf("action[ParseDomainAddrToIpAddr] ips is null, domainAddr[%v], domain[%v]",
domainAddr, domain)
return
}
for i := 0; i < len(ips); i += 1 {
if ips[i] != ips[0] {
log.LogWarnf("action[ParseDomainAddrToIpAddr] the number of ips is not one,"+
"domainAddr[%v], domain[%v], ips[%v], err[%v]", domainAddr, domain, ips, err)
return
}
}
ipAddr = fmt.Sprintf("%s:%v", ips[0], port)
success = true
return
}
func regexpCompile(str string) *regexp.Regexp {
return regexp.MustCompile("^" + str + "$")
}
func isMatch(exp *regexp.Regexp, val interface{}) bool {
switch v := val.(type) {
case []rune:
return exp.MatchString(string(v))
case []byte:
return exp.Match(v)
case string:
return exp.MatchString(v)
default:
return false
}
}
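// GenerateKey builds a key of the form "volume_inode_offset" with the offset
// zero-padded to 16 hex digits.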
func GenerateKey(volName string, ino uint64, offset uint64) string {
return fmt.Sprintf("%v_%v_%016x", volName, ino, offset)
}
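// GenerateRepVolKey builds a key of the form
// "volume_inode_dataPartition_extent_offset" with the offset zero-padded to
// 16 hex digits.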
func GenerateRepVolKey(volName string, ino uint64, dpId uint64, extentId uint64, offset uint64) string {
return fmt.Sprintf("%v_%v_%v_%v_%016x", volName, ino, dpId, extentId, offset)
}
func OneDaySec() int64 {
return 60 * 60 * 24
}
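// CalcAuthKey returns the lower-case hex MD5 digest of key.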
func CalcAuthKey(key string) (authKey string) {
h := md5.New()
_, _ = h.Write([]byte(key))
cipherStr := h.Sum(nil)
return strings.ToLower(hex.EncodeToString(cipherStr))
}
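// A small illustrative sketch of the helpers above (the argument values are
// arbitrary examples, not defaults used anywhere in CubeFS):
//
//	key := GenerateKey("vol1", 8193, 0) // "vol1_8193_0000000000000000"
//	authKey := CalcAuthKey("owner")     // 32-character lower-case md5 hex
//	daySec := OneDaySec()               // 86400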