Newer
Older
/*
* fs/dax.c - Direct Access filesystem code
* Copyright (c) 2013-2014 Intel Corporation
* Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
* Author: Ross Zwisler <ross.zwisler@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/genhd.h>

Matthew Wilcox
committed
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/sched.h>

Ingo Molnar
committed
#include <linux/sched/signal.h>
#include <linux/uio.h>

Matthew Wilcox
committed
#include <linux/vmstat.h>
#include <linux/sizes.h>
#include <linux/iomap.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
static inline unsigned int pe_order(enum page_entry_size pe_size)
{
if (pe_size == PE_SIZE_PTE)
return PAGE_SHIFT - PAGE_SHIFT;
if (pe_size == PE_SIZE_PMD)
return PMD_SHIFT - PAGE_SHIFT;
if (pe_size == PE_SIZE_PUD)
return PUD_SHIFT - PAGE_SHIFT;
return ~0;
}
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
/* The 'colour' (ie low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
/* The order of a PMD entry */
#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
{
int i;
for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
init_waitqueue_head(wait_table + i);
return 0;
}
fs_initcall(init_dax_wait_table);
* DAX pagecache entries use XArray value entries so they can't be mistaken
* for pages. We use one bit for locking, one bit for the entry size (PMD)
* and two more to tell us if the entry is a zero page or an empty entry that
* is just used for locking. In total four special bits.
*
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
* and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
* block allocation.
*/
#define DAX_SHIFT (4)
#define DAX_LOCKED (1UL << 0)
#define DAX_PMD (1UL << 1)
#define DAX_ZERO_PAGE (1UL << 2)
#define DAX_EMPTY (1UL << 3)
return xa_to_value(entry) >> DAX_SHIFT;
static void *dax_make_locked(unsigned long pfn, unsigned long flags)
return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) |
DAX_LOCKED);
static bool dax_is_locked(void *entry)
{
return xa_to_value(entry) & DAX_LOCKED;
}
static unsigned int dax_entry_order(void *entry)
static int dax_is_pmd_entry(void *entry)
return xa_to_value(entry) & DAX_PMD;
static int dax_is_pte_entry(void *entry)
return !(xa_to_value(entry) & DAX_PMD);
static int dax_is_zero_entry(void *entry)
return xa_to_value(entry) & DAX_ZERO_PAGE;
static int dax_is_empty_entry(void *entry)
return xa_to_value(entry) & DAX_EMPTY;
wait_queue_entry_t wait;
static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa,
pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
unsigned long hash;
/*
* If 'entry' is a PMD, align the 'index' that we use for the wait
* queue to the start of that PMD. This ensures that all offsets in
* the range covered by the PMD map to the same bit lock.
*/
key->entry_start = index;
hash = hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS);
return wait_table + hash;
}
static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
unsigned int mode, int sync, void *keyp)
{
struct exceptional_entry_key *key = keyp;
struct wait_exceptional_entry_queue *ewait =
container_of(wait, struct wait_exceptional_entry_queue, wait);
key->entry_start != ewait->key.entry_start)
return 0;
return autoremove_wake_function(wait, mode, sync, NULL);
}
* @entry may no longer be the entry at the index in the mapping.
* The important information it's conveying is whether the entry at
* this index used to be a PMD entry.
static void dax_wake_mapping_entry_waiter(struct xarray *xa,
pgoff_t index, void *entry, bool wake_all)
{
struct exceptional_entry_key key;
wait_queue_head_t *wq;
wq = dax_entry_waitqueue(xa, index, entry, &key);
/*
* Checking for locked entry and prepare_to_wait_exclusive() happens
* under the i_pages lock, ditto for entry handling in our callers.
* So at this point all tasks that could have seen our entry locked
* must be in the waitqueue and the following check will see them.
*/
if (waitqueue_active(wq))
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
{
return dax_wake_mapping_entry_waiter(xas->xa, xas->xa_index, entry,
wake_all);
}
/*
* Look up entry in page cache, wait for it to become unlocked if it
* is a DAX entry and return it. The caller must subsequently call
* put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
* if it did.
*
* Must be called with the i_pages lock held.
*/
static void *get_unlocked_entry(struct xa_state *xas)
{
void *entry;
struct wait_exceptional_entry_queue ewait;
wait_queue_head_t *wq;
init_wait(&ewait.wait);
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
entry = xas_load(xas);
if (!entry || xa_is_internal(entry) ||
WARN_ON_ONCE(!xa_is_value(entry)) ||
!dax_is_locked(entry))
return entry;
wq = dax_entry_waitqueue(xas->xa, xas->xa_index, entry,
&ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
xas_unlock_irq(xas);
xas_reset(xas);
schedule();
finish_wait(wq, &ewait.wait);
xas_lock_irq(xas);
}
}
static void put_unlocked_entry(struct xa_state *xas, void *entry)
{
/* If we were the only waiter woken, wake the next one */
if (entry)
dax_wake_entry(xas, entry, false);
}
/*
* We used the xa_state to get the entry, but then we locked the entry and
* dropped the xa_lock, so we know the xa_state is stale and must be reset
* before use.
*/
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
void *old;
xas_reset(xas);
xas_lock_irq(xas);
old = xas_store(xas, entry);
xas_unlock_irq(xas);
BUG_ON(!dax_is_locked(old));
dax_wake_entry(xas, entry, false);
}
/*
* Return: The entry stored at this location before it was locked.
*/
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
unsigned long v = xa_to_value(entry);
return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}
* Check whether the given slot is locked. Must be called with the i_pages
* lock held.
*/
static inline int slot_locked(struct address_space *mapping, void **slot)
{
unsigned long entry = xa_to_value(
radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
return entry & DAX_LOCKED;
* Mark the given slot as locked. Must be called with the i_pages lock held.
*/
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
unsigned long v = xa_to_value(
radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
void *entry = xa_mk_value(v | DAX_LOCKED);
radix_tree_replace_slot(&mapping->i_pages, slot, entry);
return entry;
* Mark the given slot as unlocked. Must be called with the i_pages lock held.
*/
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
unsigned long v = xa_to_value(
radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock));
void *entry = xa_mk_value(v & ~DAX_LOCKED);
radix_tree_replace_slot(&mapping->i_pages, slot, entry);
return entry;
* Lookup entry in page cache, wait for it to become unlocked if it is
* a DAX entry and return it. The caller must call
* put_unlocked_mapping_entry() when he decided not to lock the entry or
* put_locked_mapping_entry() when he locked the entry and now wants to
* unlock it.
*
static void *__get_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void ***slotp, bool (*wait_fn)(void))
init_wait(&ewait.wait);
ewait.wait.func = wake_exceptional_entry_func;
for (;;) {
entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
WARN_ON_ONCE(!xa_is_value(entry)) ||
!slot_locked(mapping, slot)) {
if (slotp)
*slotp = slot;
wq = dax_entry_waitqueue(&mapping->i_pages, index, entry,
&ewait.key);
prepare_to_wait_exclusive(wq, &ewait.wait,
TASK_UNINTERRUPTIBLE);
revalidate = wait_fn();
if (revalidate)
return ERR_PTR(-EAGAIN);
static bool entry_wait(void)
{
schedule();
/*
* Never return an ERR_PTR() from
* __get_unlocked_mapping_entry(), just keep looping.
*/
return false;
}
static void *get_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void ***slotp)
{
return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
}
static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
xa_lock_irq(&mapping->i_pages);
entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot);
if (WARN_ON_ONCE(!entry || !xa_is_value(entry) ||
!slot_locked(mapping, slot))) {
return;
}
unlock_slot(mapping, slot);
dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false);
static void put_locked_mapping_entry(struct address_space *mapping,
unlock_mapping_entry(mapping, index);
* Called when we are done with page cache entry we looked up via
* get_unlocked_mapping_entry() and which we didn't lock in the end.
*/
static void put_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
/* We have to wake up next waiter for the page cache entry lock */
dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false);
static unsigned long dax_entry_size(void *entry)
{
if (dax_is_zero_entry(entry))
return 0;
else if (dax_is_empty_entry(entry))
return 0;
else if (dax_is_pmd_entry(entry))
return PMD_SIZE;
else
return PAGE_SIZE;
}
static unsigned long dax_end_pfn(void *entry)
{
return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}
/*
* Iterate through all mapped pfns represented by an entry, i.e. skip
* 'empty' and 'zero' entries.
*/
#define for_each_mapped_pfn(entry, pfn) \
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
/*
* TODO: for reflink+dax we need a way to associate a single page with
* multiple address_space instances at different linear_page_index()
* offsets.
*/
static void dax_associate_entry(void *entry, struct address_space *mapping,
struct vm_area_struct *vma, unsigned long address)
{
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return;
index = linear_page_index(vma, address & ~(size - 1));
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
}
}
static void dax_disassociate_entry(void *entry, struct address_space *mapping,
bool trunc)
{
unsigned long pfn;
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return;
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
page->mapping = NULL;
}
}
static struct page *dax_busy_page(void *entry)
{
unsigned long pfn;
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
if (page_ref_count(page) > 1)
return page;
}
return NULL;
}
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
static bool entry_wait_revalidate(void)
{
rcu_read_unlock();
schedule();
rcu_read_lock();
/*
* Tell __get_unlocked_mapping_entry() to take a break, we need
* to revalidate page->mapping after dropping locks
*/
return true;
}
bool dax_lock_mapping_entry(struct page *page)
{
pgoff_t index;
struct inode *inode;
bool did_lock = false;
void *entry = NULL, **slot;
struct address_space *mapping;
rcu_read_lock();
for (;;) {
mapping = READ_ONCE(page->mapping);
if (!dax_mapping(mapping))
break;
/*
* In the device-dax case there's no need to lock, a
* struct dev_pagemap pin is sufficient to keep the
* inode alive, and we assume we have dev_pagemap pin
* otherwise we would not have a valid pfn_to_page()
* translation.
*/
inode = mapping->host;
if (S_ISCHR(inode->i_mode)) {
did_lock = true;
break;
}
xa_lock_irq(&mapping->i_pages);
if (mapping != page->mapping) {
xa_unlock_irq(&mapping->i_pages);
continue;
}
index = page->index;
entry = __get_unlocked_mapping_entry(mapping, index, &slot,
entry_wait_revalidate);
if (!entry) {
xa_unlock_irq(&mapping->i_pages);
break;
} else if (IS_ERR(entry)) {
WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
continue;
}
lock_slot(mapping, slot);
did_lock = true;
xa_unlock_irq(&mapping->i_pages);
break;
}
rcu_read_unlock();
return did_lock;
}
void dax_unlock_mapping_entry(struct page *page)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
if (S_ISCHR(inode->i_mode))
return;
unlock_mapping_entry(mapping, page->index);
}
* Find page cache entry at given index. If it is a DAX entry, return it
* with the entry locked. If the page cache doesn't contain an entry at
* that index, add a locked empty entry.
* When requesting an entry with size DAX_PMD, grab_mapping_entry() will
* either return that locked entry or will return an error. This error will
* happen if there are any 4k entries within the 2MiB range that we are
* requesting.
*
* We always favor 4k entries over 2MiB entries. There isn't a flow where we
* evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
* insertion will fail if it finds any 4k entries already in the tree, and a
* 4k insertion will cause an existing 2MiB entry to be unmapped and
* downgraded to 4k entries. This happens for both 2MiB huge zero pages as
* well as 2MiB empty entries.
*
* The exception to this downgrade path is for 2MiB DAX PMD entries that have
* real storage backing them. We will leave these real 2MiB DAX entries in
* the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
*
* Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
* persistent memory the benefit is doubtful. We can add that later if we can
* show it helps.
*/
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
unsigned long size_flag)
bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
entry = get_unlocked_mapping_entry(mapping, index, &slot);
if (WARN_ON_ONCE(entry && !xa_is_value(entry))) {
entry = ERR_PTR(-EIO);
goto out_unlock;
}
if (dax_is_pte_entry(entry)) {
put_unlocked_mapping_entry(mapping, index,
entry);
entry = ERR_PTR(-EEXIST);
goto out_unlock;
}
} else { /* trying to grab a PTE entry */
if (dax_is_pmd_entry(entry) &&
(dax_is_zero_entry(entry) ||
dax_is_empty_entry(entry))) {
pmd_downgrade = true;
}
}
}
/* No entry for given index? Make sure radix tree is big enough. */
if (!entry || pmd_downgrade) {
if (pmd_downgrade) {
/*
* Make sure 'entry' remains valid while we drop
*/
entry = lock_slot(mapping, slot);
}
/*
* Besides huge zero pages the only other thing that gets
* downgraded are empty entries which don't need to be
* unmapped.
*/
if (pmd_downgrade && dax_is_zero_entry(entry))
unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
PG_PMD_NR, false);
err = radix_tree_preload(
mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
if (err) {
if (pmd_downgrade)
put_locked_mapping_entry(mapping, index);
* We needed to drop the i_pages lock while calling
* radix_tree_preload() and we didn't have an entry to
* lock. See if another thread inserted an entry at
* our index during this time.
*/
entry = __radix_tree_lookup(&mapping->i_pages, index,
NULL, &slot);
if (entry) {
radix_tree_preload_end();
dax_disassociate_entry(entry, mapping, false);
dax_wake_mapping_entry_waiter(&mapping->i_pages,
index, entry, true);
entry = dax_make_locked(0, size_flag | DAX_EMPTY);
err = __radix_tree_insert(&mapping->i_pages, index,
* Our insertion of a DAX entry failed, most likely
* because we were inserting a PMD entry and it
* collided with a PTE sized entry at a different
* index in the PMD range. We haven't inserted
* anything into the radix tree and have no waiters to
* wake.
return ERR_PTR(err);
}
/* Good, we have inserted empty locked entry into the tree. */
mapping->nrexceptional++;
entry = lock_slot(mapping, slot);
/**
* dax_layout_busy_page - find first pinned page in @mapping
* @mapping: address space to scan for a page with ref count > 1
*
* DAX requires ZONE_DEVICE mapped pages. These pages are never
* 'onlined' to the page allocator so they are considered idle when
* page->count == 1. A filesystem uses this interface to determine if
* any page in the mapping is busy, i.e. for DMA, or other
* get_user_pages() usages.
*
* It is expected that the filesystem is holding locks to block the
* establishment of new mappings in this address_space. I.e. it expects
* to be able to run unmap_mapping_range() and subsequently not race
* mapping_mapped() becoming true.
*/
struct page *dax_layout_busy_page(struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, 0);
void *entry;
unsigned int scanned = 0;
struct page *page = NULL;
/*
* In the 'limited' case get_user_pages() for dax is disabled.
*/
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return NULL;
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
/*
* If we race get_user_pages_fast() here either we'll see the
* elevated page count in the iteration and wait, or
* get_user_pages_fast() will see that the page it took a reference
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
* pte_lock() and pmd_lock(). New references are not taken without
* holding those locks, and unmap_mapping_range() will not zero the
* pte or pmd without holding the respective lock, so we are
* guaranteed to either see new references or prevent new
* references from being established.
*/
unmap_mapping_range(mapping, 0, 0, 1);
xas_lock_irq(&xas);
xas_for_each(&xas, entry, ULONG_MAX) {
if (WARN_ON_ONCE(!xa_is_value(entry)))
continue;
if (unlikely(dax_is_locked(entry)))
entry = get_unlocked_entry(&xas);
if (entry)
page = dax_busy_page(entry);
put_unlocked_entry(&xas, entry);
if (page)
break;
if (++scanned % XA_CHECK_SCHED)
continue;
xas_pause(&xas);
xas_unlock_irq(&xas);
cond_resched();
xas_lock_irq(&xas);
return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
static int __dax_invalidate_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
void *entry;
xas_lock_irq(&xas);
entry = get_unlocked_entry(&xas);
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
(xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
dax_disassociate_entry(entry, mapping, trunc);
mapping->nrexceptional--;
ret = 1;
out:
put_unlocked_entry(&xas, entry);
xas_unlock_irq(&xas);
* Delete DAX entry at @index from @mapping. Wait for it
* to be unlocked before deleting it.
*/
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
int ret = __dax_invalidate_entry(mapping, index, true);
/*
* This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the
* page cache (usually fs-private i_mmap_sem for writing). Since the
* caller has seen a DAX entry for this index, we better find it
WARN_ON_ONCE(!ret);
return ret;
}
/*
* Invalidate DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index)
{
return __dax_invalidate_entry(mapping, index, false);
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
sector_t sector, size_t size, struct page *to,
unsigned long vaddr)

Matthew Wilcox
committed
{
void *vto, *kaddr;
pgoff_t pgoff;
long rc;
int id;
rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
if (rc)
return rc;
id = dax_read_lock();
rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL);
if (rc < 0) {
dax_read_unlock(id);
return rc;
}

Matthew Wilcox
committed
vto = kmap_atomic(to);
copy_user_page(vto, (void __force *)kaddr, vaddr, to);

Matthew Wilcox
committed
kunmap_atomic(vto);

Matthew Wilcox
committed
return 0;
}
/*
* By this point grab_mapping_entry() has ensured that we have a locked entry
* of the appropriate size so we don't have to worry about downgrading PMDs to
* PTEs. If we happen to be trying to insert a PTE and there is a PMD
* already in the tree, we will skip the insertion and just dirty the PMD as
* appropriate.
*/
static void *dax_insert_entry(struct address_space *mapping,
struct vm_fault *vmf,
void *entry, pfn_t pfn_t, unsigned long flags, bool dirty)
struct radix_tree_root *pages = &mapping->i_pages;
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) {
/* we are replacing a zero page with block mapping */
if (dax_is_pmd_entry(entry))
unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
PG_PMD_NR, false);
unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
dax_disassociate_entry(entry, mapping, false);
dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
}
if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
* PMD entry is already in the cache, we leave it alone. This
* means that if we are trying to insert a PTE and the
* existing entry is a PMD, we will just leave the PMD in the
* tree and dirty it if necessary.
*/
struct radix_tree_node *node;
ret = __radix_tree_lookup(pages, index, &node, &slot);
new_entry, NULL);
if (dirty)
radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY);
static inline
unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
unsigned long address;
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
return address;
}
/* Walk all mappings of a given index of a file and writeprotect them */
static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
unsigned long pfn)
pte_t pte, *ptep = NULL;
pmd_t *pmdp = NULL;
spinlock_t *ptl;
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
unsigned long address, start, end;
cond_resched();
if (!(vma->vm_flags & VM_SHARED))
continue;
address = pgoff_address(index, vma);
/*
* Note because we provide start/end to follow_pte_pmd it will
* call mmu_notifier_invalidate_range_start() on our behalf
* before taking any lock.
*/
if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
/*
* No need to call mmu_notifier_invalidate_range() as we are
* downgrading page table protection not changing it to point
* to a new page.
*
* See Documentation/vm/mmu_notifier.rst
if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
pmd_t pmd;
if (pfn != pmd_pfn(*pmdp))
goto unlock_pmd;

Linus Torvalds
committed
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
flush_cache_page(vma, address, pfn);
pmd = pmdp_huge_clear_flush(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
unlock_pmd:
#endif
spin_unlock(ptl);
} else {
if (pfn != pte_pfn(*ptep))
goto unlock_pte;
if (!pte_dirty(*ptep) && !pte_write(*ptep))
goto unlock_pte;
flush_cache_page(vma, address, pfn);
pte = ptep_clear_flush(vma, address, ptep);
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
unlock_pte:
pte_unmap_unlock(ptep, ptl);
}
mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
}
i_mmap_unlock_read(mapping);
}
static int dax_writeback_one(struct dax_device *dax_dev,