linux/drivers/md/dm-pcache/backing_dev.h
Dongsheng Yang 1d57628ff9 dm-pcache: add persistent cache target in device-mapper
This patch introduces dm-pcache, a new DM target that places a DAX-
capable persistent-memory device in front of any slower block device and
uses it as a high-throughput, low-latency cache.

Design highlights
-----------------
- DAX data path – data is copied directly between DRAM and the pmem
  mapping, bypassing the block layer’s overhead (see the sketch after
  this list).

- Segmented, crash-consistent layout
  - all layout metadata is dual-replicated and CRC-protected.
  - atomic kset flushes; key replay on mount guarantees cache integrity
    even after power loss.

- Striped multi-tree index
  - multi-tree indexing for high parallelism.
  - overlap-resolution logic ensures non-intersecting cached extents.

- Background services
  - write-back worker flushes dirty keys in order, preserving backing-device
    crash consistency; this is important for checkpoint use cases in cloud
    storage.
  - garbage collector reclaims clean segments when utilisation exceeds a
    tunable threshold.

- Data integrity – optional CRC32 on cached payload; metadata always protected.
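
As a minimal sketch of the DAX data-path idea (illustrative only, not
dm-pcache code; 'cache_kaddr' is assumed to have been obtained for the
cache device via dax_direct_access()):

	#include <linux/string.h>	/* memcpy_flushcache() */
	#include <asm/barrier.h>	/* pmem_wmb() */

	/* Copy request payload straight into the pmem mapping,
	 * bypassing the block layer; the flushing memcpy pushes the
	 * data out of the CPU caches towards the persistence domain.
	 */
	static void pmem_copy_sketch(void *cache_kaddr, const void *src,
				     size_t len)
	{
		memcpy_flushcache(cache_kaddr, src, len);
		pmem_wmb();	/* order data before dependent metadata */
	}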

Comparison with existing block-level caches
---------------------------------------------------------------------------------------------------------------------------------
| Feature                          | pcache (this patch)             | bcache                       | dm-writecache             |
|----------------------------------|---------------------------------|------------------------------|---------------------------|
| pmem access method               | DAX                             | bio (block I/O)              | DAX                       |
| Write latency (4K randwrite)     | ~5 µs                           | ~20 µs                       | ~5 µs                     |
| Concurrency                      | multi-subtree index             | global index tree            | single tree + wc_lock     |
| IOPS (4K randwrite, 32 numjobs)  | 2.1 M                           | 352 K                        | 283 K                     |
| Read-cache support               | YES                             | YES                          | NO                        |
| Deployment                       | no re-format of backend         | backend devices must be      | no re-format of backend   |
|                                  |                                 | reformatted                  |                           |
| Write-back ordering              | log-structured;                 | no ordering guarantee        | no ordering guarantee     |
|                                  | preserves app-IO-order          |                              |                           |
| Data integrity checks            | metadata + data CRC (optional)  | metadata CRC only            | none                      |
---------------------------------------------------------------------------------------------------------------------------------

Signed-off-by: Dongsheng Yang <dongsheng.yang@linux.dev>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
2025-08-25 15:25:29 +02:00


/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _BACKING_DEV_H
#define _BACKING_DEV_H

#include <linux/device-mapper.h>

#include "pcache_internal.h"

struct pcache_backing_dev_req;
typedef void (*backing_req_end_fn_t)(struct pcache_backing_dev_req *backing_req, int ret);

#define BACKING_DEV_REQ_TYPE_REQ	1
#define BACKING_DEV_REQ_TYPE_KMEM	2

#define BACKING_DEV_REQ_INLINE_BVECS	4

struct pcache_request;
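
/*
 * A single I/O issued to the backing (slow) device.  A REQ-type request
 * carries a slice of an upper pcache_request; a KMEM-type request carries
 * a kernel-memory buffer described by bvecs.  Completion is reported
 * through ->end_req.
 */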
struct pcache_backing_dev_req {
	u8 type;
	struct bio bio;
	struct pcache_backing_dev *backing_dev;

	void *priv_data;
	backing_req_end_fn_t end_req;

	struct list_head node;
	int ret;

	union {
		struct {
			struct pcache_request *upper_req;
			u32 bio_off;
		} req;
		struct {
			struct bio_vec inline_bvecs[BACKING_DEV_REQ_INLINE_BVECS];
			struct bio_vec *bvecs;
			u32 n_vecs;
		} kmem;
	};
};
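
/*
 * Per-backing-device state: the underlying dm_dev, mempools used to
 * allocate requests and bvec arrays, worker-driven submit and complete
 * lists, and an in-flight request counter with its waitqueue.
 */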
struct pcache_backing_dev {
	struct pcache_cache *cache;
	struct dm_dev *dm_dev;

	mempool_t req_pool;
	mempool_t bvec_pool;

	struct list_head submit_list;
	spinlock_t submit_lock;
	struct work_struct req_submit_work;

	struct list_head complete_list;
	spinlock_t complete_lock;
	struct work_struct req_complete_work;

	atomic_t inflight_reqs;
	wait_queue_head_t inflight_wq;

	u64 dev_size;
};

struct dm_pcache;

int backing_dev_start(struct dm_pcache *pcache);
void backing_dev_stop(struct dm_pcache *pcache);
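
/*
 * Parameters for building a backing-device request: either a slice of an
 * upper pcache_request (REQ) or a kernel buffer to be read/written at a
 * given backing-device offset (KMEM), plus allocation flags and a
 * completion callback.
 */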
struct pcache_backing_dev_req_opts {
	u32 type;
	union {
		struct {
			struct pcache_request *upper_req;
			u32 req_off;
			u32 len;
		} req;
		struct {
			void *data;
			blk_opf_t opf;
			u32 len;
			u64 backing_off;
		} kmem;
	};

	gfp_t gfp_mask;
	backing_req_end_fn_t end_fn;
	void *priv_data;
};
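
/*
 * Return the maximum number of leading bytes of @data that can be
 * coalesced: buffers that are not vmalloc'ed can go out in one piece;
 * for vmalloc'ed buffers, advance page by page and stop once a page no
 * longer shares the same pgmap as the first page.
 */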
static inline u32 backing_dev_req_coalesced_max_len(const void *data, u32 len)
{
	const void *p = data;
	u32 done = 0, in_page, to_advance;
	struct page *first_page, *next_page;

	if (!is_vmalloc_addr(data))
		return len;

	first_page = vmalloc_to_page(p);
advance:
	in_page = PAGE_SIZE - offset_in_page(p);
	to_advance = min_t(u32, in_page, len - done);

	done += to_advance;
	p += to_advance;
	if (done == len)
		return done;

	next_page = vmalloc_to_page(p);
	if (zone_device_pages_have_same_pgmap(first_page, next_page))
		goto advance;

	return done;
}
void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct);
void backing_dev_req_end(struct pcache_backing_dev_req *backing_req);
struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev,
		struct pcache_backing_dev_req_opts *opts);
struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev,
		struct pcache_backing_dev_req_opts *opts);
void backing_dev_req_init(struct pcache_backing_dev_req *backing_req,
		struct pcache_backing_dev_req_opts *opts);
void backing_dev_flush(struct pcache_backing_dev *backing_dev);

int pcache_backing_init(void);
void pcache_backing_exit(void);

#endif /* _BACKING_DEV_H */
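
For reference, a hedged sketch of how a caller might drive the request API
declared above to read a kernel buffer from the backing device.  Only the
opts fields, backing_dev_req_create() and backing_dev_req_submit() come
from this header; the callback, the variable names, and the assumed meaning
of the 'direct' flag (submit immediately rather than via the submit worker)
are illustrative guesses, not dm-pcache code.

	/* Illustrative sketch only: not part of backing_dev.h */
	#include <linux/completion.h>
	#include <linux/blk_types.h>
	#include <linux/gfp.h>
	#include "backing_dev.h"

	static void sketch_end_fn(struct pcache_backing_dev_req *backing_req, int ret)
	{
		/* completion callback; error handling of 'ret' omitted */
		complete(backing_req->priv_data);
	}

	static int sketch_read_from_backing(struct pcache_backing_dev *backing_dev,
					    void *buf, u32 len, u64 off)
	{
		DECLARE_COMPLETION_ONSTACK(done);
		struct pcache_backing_dev_req_opts opts = {
			.type		= BACKING_DEV_REQ_TYPE_KMEM,
			.kmem = {
				.data		= buf,
				.opf		= REQ_OP_READ,
				.len		= len,
				.backing_off	= off,
			},
			.gfp_mask	= GFP_NOIO,
			.end_fn		= sketch_end_fn,
			.priv_data	= &done,
		};
		struct pcache_backing_dev_req *backing_req;

		backing_req = backing_dev_req_create(backing_dev, &opts);
		if (!backing_req)
			return -ENOMEM;

		/* 'true' assumed to mean: submit now, not via the worker */
		backing_dev_req_submit(backing_req, true);
		wait_for_completion(&done);
		return 0;
	}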